; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c


define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovdqa64 136(%ebp), %zmm3
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: kunpckbw %k0, %k1, %k1
; X86-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $-1, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestc:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $-1, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $0, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestz:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $0, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}


define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}


define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}


define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permute_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86: # %bb.0:
1411; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1412; X86-NEXT: kmovw %eax, %k1 1413; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1414; X86-NEXT: retl 1415; 1416; X64-LABEL: test_mm512_maskz_unpackhi_ps: 1417; X64: # %bb.0: 1418; X64-NEXT: kmovw %edi, %k1 1419; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1420; X64-NEXT: retq 1421 %arg0 = bitcast i16 %a0 to <16 x i1> 1422 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> 1423 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer 1424 ret <16 x float> %res1 1425} 1426 1427define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) { 1428; CHECK-LABEL: test_mm512_unpacklo_epi32: 1429; CHECK: # %bb.0: 1430; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1431; CHECK-NEXT: ret{{[l|q]}} 1432 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1433 %arg1 = bitcast <8 x i64> %a1 to <16 x i32> 1434 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1435 %res1 = bitcast <16 x i32> %res0 to <8 x i64> 1436 ret <8 x i64> %res1 1437} 1438 1439define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { 1440; X86-LABEL: test_mm512_mask_unpacklo_epi32: 1441; X86: # %bb.0: 1442; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1443; X86-NEXT: kmovw %eax, %k1 1444; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1445; X86-NEXT: retl 1446; 1447; X64-LABEL: test_mm512_mask_unpacklo_epi32: 1448; X64: # %bb.0: 1449; X64-NEXT: kmovw %edi, %k1 1450; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1451; X64-NEXT: retq 1452 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1453 %arg1 = bitcast i16 %a1 to <16 x i1> 1454 %arg2 = bitcast <8 x i64> %a2 to <16 x i32> 1455 %arg3 = bitcast <8 x i64> %a3 to <16 x i32> 1456 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1457 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0 1458 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1459 ret <8 x i64> %res2 1460} 1461 1462define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { 1463; X86-LABEL: test_mm512_maskz_unpacklo_epi32: 1464; X86: # %bb.0: 1465; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1466; X86-NEXT: kmovw %eax, %k1 1467; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1468; X86-NEXT: retl 1469; 1470; X64-LABEL: test_mm512_maskz_unpacklo_epi32: 1471; X64: # %bb.0: 1472; 
X64-NEXT: kmovw %edi, %k1 1473; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1474; X64-NEXT: retq 1475 %arg0 = bitcast i16 %a0 to <16 x i1> 1476 %arg1 = bitcast <8 x i64> %a1 to <16 x i32> 1477 %arg2 = bitcast <8 x i64> %a2 to <16 x i32> 1478 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1479 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer 1480 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1481 ret <8 x i64> %res2 1482} 1483 1484define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) { 1485; CHECK-LABEL: test_mm512_unpacklo_epi64: 1486; CHECK: # %bb.0: 1487; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1488; CHECK-NEXT: ret{{[l|q]}} 1489 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1490 ret <8 x i64> %res 1491} 1492 1493define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) { 1494; X86-LABEL: test_mm512_mask_unpacklo_epi64: 1495; X86: # %bb.0: 1496; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1497; X86-NEXT: kmovw %eax, %k1 1498; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1499; X86-NEXT: retl 1500; 1501; X64-LABEL: test_mm512_mask_unpacklo_epi64: 1502; X64: # %bb.0: 1503; X64-NEXT: kmovw %edi, %k1 1504; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1505; X64-NEXT: retq 1506 %arg1 = bitcast i8 %a1 to <8 x i1> 1507 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1508 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0 1509 ret <8 x i64> %res1 1510} 1511 1512define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) { 1513; X86-LABEL: test_mm512_maskz_unpacklo_epi64: 1514; X86: # %bb.0: 1515; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1516; X86-NEXT: kmovw %eax, %k1 1517; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1518; X86-NEXT: retl 1519; 1520; X64-LABEL: test_mm512_maskz_unpacklo_epi64: 1521; X64: # %bb.0: 1522; X64-NEXT: kmovw %edi, %k1 1523; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1524; X64-NEXT: retq 1525 %arg0 = bitcast i8 %a0 to <8 x i1> 1526 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1527 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer 1528 ret <8 x i64> %res1 1529} 1530 1531define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) { 1532; CHECK-LABEL: test_mm512_unpacklo_pd: 1533; CHECK: # %bb.0: 1534; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1535; CHECK-NEXT: ret{{[l|q]}} 1536 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1537 ret <8 x double> %res 1538} 1539 1540define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x 
double> %a3) { 1541; X86-LABEL: test_mm512_mask_unpacklo_pd: 1542; X86: # %bb.0: 1543; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1544; X86-NEXT: kmovw %eax, %k1 1545; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1546; X86-NEXT: retl 1547; 1548; X64-LABEL: test_mm512_mask_unpacklo_pd: 1549; X64: # %bb.0: 1550; X64-NEXT: kmovw %edi, %k1 1551; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1552; X64-NEXT: retq 1553 %arg1 = bitcast i8 %a1 to <8 x i1> 1554 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1555 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0 1556 ret <8 x double> %res1 1557} 1558 1559define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) { 1560; X86-LABEL: test_mm512_maskz_unpacklo_pd: 1561; X86: # %bb.0: 1562; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1563; X86-NEXT: kmovw %eax, %k1 1564; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1565; X86-NEXT: retl 1566; 1567; X64-LABEL: test_mm512_maskz_unpacklo_pd: 1568; X64: # %bb.0: 1569; X64-NEXT: kmovw %edi, %k1 1570; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1571; X64-NEXT: retq 1572 %arg0 = bitcast i8 %a0 to <8 x i1> 1573 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1574 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer 1575 ret <8 x double> %res1 1576} 1577 1578define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) { 1579; CHECK-LABEL: test_mm512_unpacklo_ps: 1580; CHECK: # %bb.0: 1581; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1582; CHECK-NEXT: ret{{[l|q]}} 1583 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1584 ret <16 x float> %res 1585} 1586 1587define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { 1588; X86-LABEL: test_mm512_mask_unpacklo_ps: 1589; X86: # %bb.0: 1590; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1591; X86-NEXT: kmovw %eax, %k1 1592; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1593; X86-NEXT: retl 1594; 1595; X64-LABEL: test_mm512_mask_unpacklo_ps: 1596; X64: # %bb.0: 1597; X64-NEXT: kmovw %edi, %k1 1598; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1599; X64-NEXT: retq 1600 %arg1 = bitcast i16 %a1 to <16 x i1> 1601 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1602 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0 1603 ret <16 x float> %res1 1604} 1605 1606define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { 1607; 
X86-LABEL: test_mm512_maskz_unpacklo_ps: 1608; X86: # %bb.0: 1609; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1610; X86-NEXT: kmovw %eax, %k1 1611; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1612; X86-NEXT: retl 1613; 1614; X64-LABEL: test_mm512_maskz_unpacklo_ps: 1615; X64: # %bb.0: 1616; X64-NEXT: kmovw %edi, %k1 1617; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1618; X64-NEXT: retq 1619 %arg0 = bitcast i16 %a0 to <16 x i1> 1620 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1621 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer 1622 ret <16 x float> %res1 1623} 1624 1625define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind { 1626; CHECK-LABEL: test_mm512_zextpd128_pd512: 1627; CHECK: # %bb.0: 1628; CHECK-NEXT: vmovaps %xmm0, %xmm0 1629; CHECK-NEXT: ret{{[l|q]}} 1630 %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1631 ret <8 x double> %res 1632} 1633 1634define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind { 1635; CHECK-LABEL: test_mm512_zextpd256_pd512: 1636; CHECK: # %bb.0: 1637; CHECK-NEXT: vmovaps %ymm0, %ymm0 1638; CHECK-NEXT: ret{{[l|q]}} 1639 %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1640 ret <8 x double> %res 1641} 1642 1643define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind { 1644; CHECK-LABEL: test_mm512_zextps128_ps512: 1645; CHECK: # %bb.0: 1646; CHECK-NEXT: vmovaps %xmm0, %xmm0 1647; CHECK-NEXT: ret{{[l|q]}} 1648 %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> 1649 ret <16 x float> %res 1650} 1651 1652define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind { 1653; CHECK-LABEL: test_mm512_zextps256_ps512: 1654; CHECK: # %bb.0: 1655; CHECK-NEXT: vmovaps %ymm0, %ymm0 1656; CHECK-NEXT: ret{{[l|q]}} 1657 %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1658 ret <16 x float> %res 1659} 1660 1661define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind { 1662; CHECK-LABEL: test_mm512_zextsi128_si512: 1663; CHECK: # %bb.0: 1664; CHECK-NEXT: vmovaps %xmm0, %xmm0 1665; CHECK-NEXT: ret{{[l|q]}} 1666 %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1667 ret <8 x i64> %res 1668} 1669 1670define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind { 1671; CHECK-LABEL: test_mm512_zextsi256_si512: 1672; CHECK: # %bb.0: 1673; CHECK-NEXT: vmovaps %ymm0, %ymm0 1674; CHECK-NEXT: ret{{[l|q]}} 1675 %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1676 ret <8 x i64> %res 1677} 1678 1679define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, 
<8 x i64> %__B) nounwind { 1680; CHECK-LABEL: test_mm512_mul_epi32: 1681; CHECK: # %bb.0: 1682; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 1683; CHECK-NEXT: ret{{[l|q]}} 1684 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1685 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1686 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1687 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1688 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1 1689 ret <8 x i64> %tmp4 1690} 1691 1692define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind { 1693; X86-LABEL: test_mm512_maskz_mul_epi32: 1694; X86: # %bb.0: # %entry 1695; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1696; X86-NEXT: kmovw %eax, %k1 1697; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z} 1698; X86-NEXT: retl 1699; 1700; X64-LABEL: test_mm512_maskz_mul_epi32: 1701; X64: # %bb.0: # %entry 1702; X64-NEXT: kmovw %edi, %k1 1703; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z} 1704; X64-NEXT: retq 1705entry: 1706 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1707 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1708 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1709 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1710 %4 = mul nsw <8 x i64> %3, %1 1711 %5 = bitcast i8 %__k to <8 x i1> 1712 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer 1713 ret <8 x i64> %6 1714} 1715 1716define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind { 1717; X86-LABEL: test_mm512_mask_mul_epi32: 1718; X86: # %bb.0: # %entry 1719; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1720; X86-NEXT: kmovw %eax, %k1 1721; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1} 1722; X86-NEXT: vmovdqa64 %zmm2, %zmm0 1723; X86-NEXT: retl 1724; 1725; X64-LABEL: test_mm512_mask_mul_epi32: 1726; X64: # %bb.0: # %entry 1727; X64-NEXT: kmovw %edi, %k1 1728; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1} 1729; X64-NEXT: vmovdqa64 %zmm2, %zmm0 1730; X64-NEXT: retq 1731entry: 1732 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1733 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1734 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1735 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1736 %4 = mul nsw <8 x i64> %3, %1 1737 %5 = bitcast i8 %__k to <8 x i1> 1738 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src 1739 ret <8 x i64> %6 1740} 1741 1742define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind { 1743; CHECK-LABEL: test_mm512_mul_epu32: 1744; CHECK: # %bb.0: 1745; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 1746; CHECK-NEXT: ret{{[l|q]}} 1747 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1748 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1749 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp 1750 ret <8 x i64> %tmp2 1751} 1752 
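; Hand-added note (not autogenerated): the mul_epu32 test above works because
; each 64-bit lane is first masked with 4294967295 (0xFFFFFFFF), roughly
;   %lo = and i64 %x, 4294967295   ; per lane: keep bits [31:0]
; so the nuw 64x64 multiply only ever sees 32-bit inputs, which is exactly
; what vpmuludq computes. The signed mul_epi32 tests further up instead use a
; shl/ashr-by-32 pair to sign-extend the low half, matching vpmuldq. The
; commuted operand order in the asm (zmm1 times zmm0) is harmless since the
; multiply is commutative.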
1753define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind { 1754; X86-LABEL: test_mm512_maskz_mul_epu32: 1755; X86: # %bb.0: # %entry 1756; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1757; X86-NEXT: kmovw %eax, %k1 1758; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z} 1759; X86-NEXT: retl 1760; 1761; X64-LABEL: test_mm512_maskz_mul_epu32: 1762; X64: # %bb.0: # %entry 1763; X64-NEXT: kmovw %edi, %k1 1764; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z} 1765; X64-NEXT: retq 1766entry: 1767 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1768 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1769 %2 = mul nuw <8 x i64> %1, %0 1770 %3 = bitcast i8 %__k to <8 x i1> 1771 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1772 ret <8 x i64> %4 1773} 1774 1775define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind { 1776; X86-LABEL: test_mm512_mask_mul_epu32: 1777; X86: # %bb.0: # %entry 1778; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1779; X86-NEXT: kmovw %eax, %k1 1780; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1} 1781; X86-NEXT: vmovdqa64 %zmm2, %zmm0 1782; X86-NEXT: retl 1783; 1784; X64-LABEL: test_mm512_mask_mul_epu32: 1785; X64: # %bb.0: # %entry 1786; X64-NEXT: kmovw %edi, %k1 1787; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1} 1788; X64-NEXT: vmovdqa64 %zmm2, %zmm0 1789; X64-NEXT: retq 1790entry: 1791 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1792 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1793 %2 = mul nuw <8 x i64> %1, %0 1794 %3 = bitcast i8 %__k to <8 x i1> 1795 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src 1796 ret <8 x i64> %4 1797} 1798 1799define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind { 1800; X86-LABEL: test_mm512_set1_epi8: 1801; X86: # %bb.0: # %entry 1802; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1803; X86-NEXT: vmovd %eax, %xmm0 1804; X86-NEXT: vpbroadcastb %xmm0, %ymm0 1805; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 1806; X86-NEXT: retl 1807; 1808; X64-LABEL: test_mm512_set1_epi8: 1809; X64: # %bb.0: # %entry 1810; X64-NEXT: vmovd %edi, %xmm0 1811; X64-NEXT: vpbroadcastb %xmm0, %ymm0 1812; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 1813; X64-NEXT: retq 1814entry: 1815 %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0 1816 %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer 1817 %0 = bitcast <64 x i8> %vecinit63.i to <8 x double> 1818 ret <8 x double> %0 1819} 1820 1821define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) { 1822; X86-LABEL: test_mm_cvtu32_sd: 1823; X86: # %bb.0: # %entry 1824; X86-NEXT: vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0 1825; X86-NEXT: retl 1826; 1827; X64-LABEL: test_mm_cvtu32_sd: 1828; X64: # %bb.0: # %entry 1829; X64-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 1830; X64-NEXT: retq 1831entry: 1832 %conv.i = uitofp i32 %__B to double 1833 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0 1834 ret <2 x double> %vecins.i 1835} 1836 1837define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) { 
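; Hand-added note (not autogenerated): i386 has no 64-bit GPRs, so the X86
; block below cannot apply vcvtusi2sd to the i64 argument. What follows is
; believed to be the standard uint64->double expansion: the two 32-bit halves
; are interleaved against bias constants (vpunpckldq with a constant-pool
; operand), the biases are removed with vsubpd, and the two partial doubles
; are summed with vaddsd before being blended back into the result.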
1838; X86-LABEL: test_mm_cvtu64_sd: 1839; X86: # %bb.0: # %entry 1840; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1841; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 1842; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] 1843; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1 1844; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1845; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1 1846; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1847; X86-NEXT: retl 1848; 1849; X64-LABEL: test_mm_cvtu64_sd: 1850; X64: # %bb.0: # %entry 1851; X64-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 1852; X64-NEXT: retq 1853entry: 1854 %conv.i = uitofp i64 %__B to double 1855 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0 1856 ret <2 x double> %vecins.i 1857} 1858 1859define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) { 1860; X86-LABEL: test_mm_cvtu32_ss: 1861; X86: # %bb.0: # %entry 1862; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 1863; X86-NEXT: retl 1864; 1865; X64-LABEL: test_mm_cvtu32_ss: 1866; X64: # %bb.0: # %entry 1867; X64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 1868; X64-NEXT: retq 1869entry: 1870 %conv.i = uitofp i32 %__B to float 1871 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0 1872 ret <4 x float> %vecins.i 1873} 1874 1875define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) { 1876; X86-LABEL: test_mm_cvtu64_ss: 1877; X86: # %bb.0: # %entry 1878; X86-NEXT: pushl %ebp 1879; X86-NEXT: .cfi_def_cfa_offset 8 1880; X86-NEXT: .cfi_offset %ebp, -8 1881; X86-NEXT: movl %esp, %ebp 1882; X86-NEXT: .cfi_def_cfa_register %ebp 1883; X86-NEXT: andl $-8, %esp 1884; X86-NEXT: subl $16, %esp 1885; X86-NEXT: movl 12(%ebp), %eax 1886; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1887; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 1888; X86-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) 1889; X86-NEXT: shrl $31, %eax 1890; X86-NEXT: fildll {{[0-9]+}}(%esp) 1891; X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) 1892; X86-NEXT: fstps {{[0-9]+}}(%esp) 1893; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1894; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1895; X86-NEXT: movl %ebp, %esp 1896; X86-NEXT: popl %ebp 1897; X86-NEXT: .cfi_def_cfa %esp, 4 1898; X86-NEXT: retl 1899; 1900; X64-LABEL: test_mm_cvtu64_ss: 1901; X64: # %bb.0: # %entry 1902; X64-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 1903; X64-NEXT: retq 1904entry: 1905 %conv.i = uitofp i64 %__B to float 1906 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0 1907 ret <4 x float> %vecins.i 1908} 1909 1910define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) { 1911; CHECK-LABEL: test_mm512_cvtph_ps: 1912; CHECK: # %bb.0: # %entry 1913; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 1914; CHECK-NEXT: ret{{[l|q]}} 1915entry: 1916 %0 = bitcast <4 x i64> %__A to <16 x i16> 1917 %1 = bitcast <16 x i16> %0 to <16 x half> 1918 %2 = fpext <16 x half> %1 to <16 x float> 1919 ret <16 x float> %2 1920} 1921 1922define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) { 1923; X86-LABEL: test_mm512_mask_cvtph_ps: 1924; X86: # %bb.0: # %entry 1925; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1926; X86-NEXT: kmovw %eax, %k1 1927; X86-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1} 1928; X86-NEXT: retl 1929; 1930; X64-LABEL: test_mm512_mask_cvtph_ps: 1931; X64: # %bb.0: # %entry 1932; X64-NEXT: kmovw %edi, %k1 1933; X64-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1} 1934; X64-NEXT: retq 1935entry: 1936 %0 = bitcast <4 x i64> %__A to <16 x i16> 1937 %1 = bitcast <16 x i16> %0 to <16 x 
half> 1938 %2 = bitcast i16 %__U to <16 x i1> 1939 %3 = fpext <16 x half> %1 to <16 x float> 1940 %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> %__W 1941 ret <16 x float> %4 1942} 1943 1944define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) { 1945; X86-LABEL: test_mm512_maskz_cvtph_ps: 1946; X86: # %bb.0: # %entry 1947; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1948; X86-NEXT: kmovw %eax, %k1 1949; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} 1950; X86-NEXT: retl 1951; 1952; X64-LABEL: test_mm512_maskz_cvtph_ps: 1953; X64: # %bb.0: # %entry 1954; X64-NEXT: kmovw %edi, %k1 1955; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} 1956; X64-NEXT: retq 1957entry: 1958 %0 = bitcast <4 x i64> %__A to <16 x i16> 1959 %1 = bitcast <16 x i16> %0 to <16 x half> 1960 %2 = bitcast i16 %__U to <16 x i1> 1961 %3 = fpext <16 x half> %1 to <16 x float> 1962 %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> zeroinitializer 1963 ret <16 x float> %4 1964} 1965 1966define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) { 1967; CHECK-LABEL: test_mm512_cvtps_pd: 1968; CHECK: # %bb.0: # %entry 1969; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0 1970; CHECK-NEXT: ret{{[l|q]}} 1971entry: 1972 %conv.i = fpext <8 x float> %__A to <8 x double> 1973 ret <8 x double> %conv.i 1974} 1975 1976define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) { 1977; CHECK-LABEL: test_mm512_cvtpslo_pd: 1978; CHECK: # %bb.0: # %entry 1979; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0 1980; CHECK-NEXT: ret{{[l|q]}} 1981entry: 1982 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1983 %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double> 1984 ret <8 x double> %conv.i.i 1985} 1986 1987define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) { 1988; X86-LABEL: test_mm512_mask_cvtps_pd: 1989; X86: # %bb.0: # %entry 1990; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1991; X86-NEXT: kmovw %eax, %k1 1992; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 1993; X86-NEXT: retl 1994; 1995; X64-LABEL: test_mm512_mask_cvtps_pd: 1996; X64: # %bb.0: # %entry 1997; X64-NEXT: kmovw %edi, %k1 1998; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 1999; X64-NEXT: retq 2000entry: 2001 %conv.i.i = fpext <8 x float> %__A to <8 x double> 2002 %0 = bitcast i8 %__U to <8 x i1> 2003 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W 2004 ret <8 x double> %1 2005} 2006 2007define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) { 2008; X86-LABEL: test_mm512_mask_cvtpslo_pd: 2009; X86: # %bb.0: # %entry 2010; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2011; X86-NEXT: kmovw %eax, %k1 2012; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 2013; X86-NEXT: retl 2014; 2015; X64-LABEL: test_mm512_mask_cvtpslo_pd: 2016; X64: # %bb.0: # %entry 2017; X64-NEXT: kmovw %edi, %k1 2018; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 2019; X64-NEXT: retq 2020entry: 2021 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2022 %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double> 2023 %0 = bitcast i8 %__U to <8 x i1> 2024 %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W 2025 ret <8 x double> %1 2026} 2027 2028define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) { 2029; X86-LABEL: test_mm512_maskz_cvtps_pd: 2030; X86: # %bb.0: # %entry 2031; X86-NEXT: movb {{[0-9]+}}(%esp), %al 
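; (i8 masks take the same route here: the byte is loaded into %al and the
; full %eax is moved with kmovw, because the byte-sized kmovb requires
; AVX512DQ while these tests only enable +avx512f; the unused high mask
; bits are simply ignored by the masked instruction.)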
2032; X86-NEXT: kmovw %eax, %k1 2033; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z} 2034; X86-NEXT: retl 2035; 2036; X64-LABEL: test_mm512_maskz_cvtps_pd: 2037; X64: # %bb.0: # %entry 2038; X64-NEXT: kmovw %edi, %k1 2039; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z} 2040; X64-NEXT: retq 2041entry: 2042 %conv.i.i = fpext <8 x float> %__A to <8 x double> 2043 %0 = bitcast i8 %__U to <8 x i1> 2044 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer 2045 ret <8 x double> %1 2046} 2047 2048define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) { 2049; CHECK-LABEL: test_mm512_cvtepi32_epi8: 2050; CHECK: # %bb.0: # %entry 2051; CHECK-NEXT: vpmovdb %zmm0, %xmm0 2052; CHECK-NEXT: vzeroupper 2053; CHECK-NEXT: ret{{[l|q]}} 2054entry: 2055 %0 = bitcast <8 x i64> %__A to <16 x i32> 2056 %conv.i = trunc <16 x i32> %0 to <16 x i8> 2057 %1 = bitcast <16 x i8> %conv.i to <2 x i64> 2058 ret <2 x i64> %1 2059} 2060 2061define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) { 2062; X86-LABEL: test_mm512_mask_cvtepi32_epi8: 2063; X86: # %bb.0: # %entry 2064; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2065; X86-NEXT: kmovw %eax, %k1 2066; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1} 2067; X86-NEXT: vzeroupper 2068; X86-NEXT: retl 2069; 2070; X64-LABEL: test_mm512_mask_cvtepi32_epi8: 2071; X64: # %bb.0: # %entry 2072; X64-NEXT: kmovw %edi, %k1 2073; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1} 2074; X64-NEXT: vzeroupper 2075; X64-NEXT: retq 2076entry: 2077 %0 = bitcast <8 x i64> %__A to <16 x i32> 2078 %1 = bitcast <2 x i64> %__O to <16 x i8> 2079 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M) 2080 %3 = bitcast <16 x i8> %2 to <2 x i64> 2081 ret <2 x i64> %3 2082} 2083 2084define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) { 2085; X86-LABEL: test_mm512_maskz_cvtepi32_epi8: 2086; X86: # %bb.0: # %entry 2087; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2088; X86-NEXT: kmovw %eax, %k1 2089; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z} 2090; X86-NEXT: vzeroupper 2091; X86-NEXT: retl 2092; 2093; X64-LABEL: test_mm512_maskz_cvtepi32_epi8: 2094; X64: # %bb.0: # %entry 2095; X64-NEXT: kmovw %edi, %k1 2096; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z} 2097; X64-NEXT: vzeroupper 2098; X64-NEXT: retq 2099entry: 2100 %0 = bitcast <8 x i64> %__A to <16 x i32> 2101 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M) 2102 %2 = bitcast <16 x i8> %1 to <2 x i64> 2103 ret <2 x i64> %2 2104} 2105 2106define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) { 2107; CHECK-LABEL: test_mm512_cvtepi64_epi32: 2108; CHECK: # %bb.0: # %entry 2109; CHECK-NEXT: vpmovqd %zmm0, %ymm0 2110; CHECK-NEXT: ret{{[l|q]}} 2111entry: 2112 %conv.i = trunc <8 x i64> %__A to <8 x i32> 2113 %0 = bitcast <8 x i32> %conv.i to <4 x i64> 2114 ret <4 x i64> %0 2115} 2116 2117define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) { 2118; X86-LABEL: test_mm512_mask_cvtepi64_epi32: 2119; X86: # %bb.0: # %entry 2120; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2121; X86-NEXT: kmovw %eax, %k1 2122; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1} 2123; X86-NEXT: retl 2124; 2125; X64-LABEL: test_mm512_mask_cvtepi64_epi32: 2126; X64: # %bb.0: # %entry 2127; X64-NEXT: kmovw %edi, %k1 2128; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1} 2129; X64-NEXT: retq 2130entry: 2131 %conv.i.i = trunc <8 x i64> %__A to <8 x i32> 2132 %0 = bitcast <4 x i64> %__O to <8 x i32> 2133 %1 = 
bitcast i8 %__M to <8 x i1> 2134 %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0 2135 %3 = bitcast <8 x i32> %2 to <4 x i64> 2136 ret <4 x i64> %3 2137} 2138 2139define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) { 2140; X86-LABEL: test_mm512_maskz_cvtepi64_epi32: 2141; X86: # %bb.0: # %entry 2142; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2143; X86-NEXT: kmovw %eax, %k1 2144; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} 2145; X86-NEXT: retl 2146; 2147; X64-LABEL: test_mm512_maskz_cvtepi64_epi32: 2148; X64: # %bb.0: # %entry 2149; X64-NEXT: kmovw %edi, %k1 2150; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} 2151; X64-NEXT: retq 2152entry: 2153 %conv.i.i = trunc <8 x i64> %__A to <8 x i32> 2154 %0 = bitcast i8 %__M to <8 x i1> 2155 %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer 2156 %2 = bitcast <8 x i32> %1 to <4 x i64> 2157 ret <4 x i64> %2 2158} 2159 2160define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) { 2161; CHECK-LABEL: test_mm512_cvtepi64_epi16: 2162; CHECK: # %bb.0: # %entry 2163; CHECK-NEXT: vpmovqw %zmm0, %xmm0 2164; CHECK-NEXT: vzeroupper 2165; CHECK-NEXT: ret{{[l|q]}} 2166entry: 2167 %conv.i = trunc <8 x i64> %__A to <8 x i16> 2168 %0 = bitcast <8 x i16> %conv.i to <2 x i64> 2169 ret <2 x i64> %0 2170} 2171 2172define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) { 2173; X86-LABEL: test_mm512_mask_cvtepi64_epi16: 2174; X86: # %bb.0: # %entry 2175; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2176; X86-NEXT: kmovw %eax, %k1 2177; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1} 2178; X86-NEXT: vzeroupper 2179; X86-NEXT: retl 2180; 2181; X64-LABEL: test_mm512_mask_cvtepi64_epi16: 2182; X64: # %bb.0: # %entry 2183; X64-NEXT: kmovw %edi, %k1 2184; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1} 2185; X64-NEXT: vzeroupper 2186; X64-NEXT: retq 2187entry: 2188 %0 = bitcast <2 x i64> %__O to <8 x i16> 2189 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M) 2190 %2 = bitcast <8 x i16> %1 to <2 x i64> 2191 ret <2 x i64> %2 2192} 2193 2194define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) { 2195; X86-LABEL: test_mm512_maskz_cvtepi64_epi16: 2196; X86: # %bb.0: # %entry 2197; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2198; X86-NEXT: kmovw %eax, %k1 2199; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z} 2200; X86-NEXT: vzeroupper 2201; X86-NEXT: retl 2202; 2203; X64-LABEL: test_mm512_maskz_cvtepi64_epi16: 2204; X64: # %bb.0: # %entry 2205; X64-NEXT: kmovw %edi, %k1 2206; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z} 2207; X64-NEXT: vzeroupper 2208; X64-NEXT: retq 2209entry: 2210 %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M) 2211 %1 = bitcast <8 x i16> %0 to <2 x i64> 2212 ret <2 x i64> %1 2213} 2214 2215declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) 2216declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) 2217 2218define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { 2219; CHECK-LABEL: test_mm512_ternarylogic_epi32: 2220; CHECK: # %bb.0: # %entry 2221; CHECK-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 2222; CHECK-NEXT: ret{{[l|q]}} 2223entry: 2224 %0 = bitcast <8 x i64> %__A to <16 x i32> 2225 %1 = bitcast <8 x i64> %__B to <16 x i32> 2226 %2 = bitcast <8 x i64> %__C to <16 x i32> 2227 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4) 
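  ; Hand-added note (not autogenerated): the trailing i32 4 is the vpternlog
  ; truth table. Reading bit ((A<<2)|(B<<1)|C) of the immediate as the result
  ; for that combination of source bits, 4 sets only entry 0b010, which would
  ; make the operation ~A & B & ~C; treat this mapping as a sketch of the
  ; usual encoding rather than a normative statement.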
2228 %4 = bitcast <16 x i32> %3 to <8 x i64> 2229 ret <8 x i64> %4 2230} 2231 2232declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1 2233 2234define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) { 2235; X86-LABEL: test_mm512_mask_ternarylogic_epi32: 2236; X86: # %bb.0: # %entry 2237; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2238; X86-NEXT: kmovw %eax, %k1 2239; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} 2240; X86-NEXT: retl 2241; 2242; X64-LABEL: test_mm512_mask_ternarylogic_epi32: 2243; X64: # %bb.0: # %entry 2244; X64-NEXT: kmovw %edi, %k1 2245; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} 2246; X64-NEXT: retq 2247entry: 2248 %0 = bitcast <8 x i64> %__A to <16 x i32> 2249 %1 = bitcast <8 x i64> %__B to <16 x i32> 2250 %2 = bitcast <8 x i64> %__C to <16 x i32> 2251 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4) 2252 %4 = bitcast i16 %__U to <16 x i1> 2253 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0 2254 %6 = bitcast <16 x i32> %5 to <8 x i64> 2255 ret <8 x i64> %6 2256} 2257 2258define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { 2259; X86-LABEL: test_mm512_maskz_ternarylogic_epi32: 2260; X86: # %bb.0: # %entry 2261; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2262; X86-NEXT: kmovw %eax, %k1 2263; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2264; X86-NEXT: retl 2265; 2266; X64-LABEL: test_mm512_maskz_ternarylogic_epi32: 2267; X64: # %bb.0: # %entry 2268; X64-NEXT: kmovw %edi, %k1 2269; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2270; X64-NEXT: retq 2271entry: 2272 %0 = bitcast <8 x i64> %__A to <16 x i32> 2273 %1 = bitcast <8 x i64> %__B to <16 x i32> 2274 %2 = bitcast <8 x i64> %__C to <16 x i32> 2275 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4) 2276 %4 = bitcast i16 %__U to <16 x i1> 2277 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 2278 %6 = bitcast <16 x i32> %5 to <8 x i64> 2279 ret <8 x i64> %6 2280} 2281 2282define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { 2283; CHECK-LABEL: test_mm512_ternarylogic_epi64: 2284; CHECK: # %bb.0: # %entry 2285; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 2286; CHECK-NEXT: ret{{[l|q]}} 2287entry: 2288 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) 2289 ret <8 x i64> %0 2290} 2291 2292declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1 2293 2294define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) { 2295; X86-LABEL: test_mm512_mask_ternarylogic_epi64: 2296; X86: # %bb.0: # %entry 2297; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2298; X86-NEXT: kmovw %eax, %k1 2299; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} 2300; X86-NEXT: retl 2301; 2302; X64-LABEL: test_mm512_mask_ternarylogic_epi64: 2303; X64: # %bb.0: # %entry 2304; X64-NEXT: kmovw %edi, %k1 2305; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} 2306; X64-NEXT: retq 2307entry: 2308 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) 2309 %1 = bitcast i8 %__U to <8 x i1> 2310 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A 2311 ret <8 x i64> %2 
2312} 2313 2314define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { 2315; X86-LABEL: test_mm512_maskz_ternarylogic_epi64: 2316; X86: # %bb.0: # %entry 2317; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2318; X86-NEXT: kmovw %eax, %k1 2319; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2320; X86-NEXT: retl 2321; 2322; X64-LABEL: test_mm512_maskz_ternarylogic_epi64: 2323; X64: # %bb.0: # %entry 2324; X64-NEXT: kmovw %edi, %k1 2325; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2326; X64-NEXT: retq 2327entry: 2328 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) 2329 %1 = bitcast i8 %__U to <8 x i1> 2330 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 2331 ret <8 x i64> %2 2332} 2333 2334declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) 2335 2336define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) { 2337; X86-LABEL: test_mm512_mask2_permutex2var_epi32: 2338; X86: # %bb.0: # %entry 2339; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2340; X86-NEXT: kmovw %eax, %k1 2341; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1} 2342; X86-NEXT: vmovdqa64 %zmm1, %zmm0 2343; X86-NEXT: retl 2344; 2345; X64-LABEL: test_mm512_mask2_permutex2var_epi32: 2346; X64: # %bb.0: # %entry 2347; X64-NEXT: kmovw %edi, %k1 2348; X64-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1} 2349; X64-NEXT: vmovdqa64 %zmm1, %zmm0 2350; X64-NEXT: retq 2351entry: 2352 %0 = bitcast <8 x i64> %__A to <16 x i32> 2353 %1 = bitcast <8 x i64> %__I to <16 x i32> 2354 %2 = bitcast <8 x i64> %__B to <16 x i32> 2355 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) 2356 %4 = bitcast i16 %__U to <16 x i1> 2357 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1 2358 %6 = bitcast <16 x i32> %5 to <8 x i64> 2359 ret <8 x i64> %6 2360} 2361 2362declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) 2363 2364define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) { 2365; X86-LABEL: test_mm512_mask2_permutex2var_pd: 2366; X86: # %bb.0: # %entry 2367; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2368; X86-NEXT: kmovw %eax, %k1 2369; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} 2370; X86-NEXT: vmovapd %zmm1, %zmm0 2371; X86-NEXT: retl 2372; 2373; X64-LABEL: test_mm512_mask2_permutex2var_pd: 2374; X64: # %bb.0: # %entry 2375; X64-NEXT: kmovw %edi, %k1 2376; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} 2377; X64-NEXT: vmovapd %zmm1, %zmm0 2378; X64-NEXT: retq 2379entry: 2380 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) 2381 %1 = bitcast <8 x i64> %__I to <8 x double> 2382 %2 = bitcast i8 %__U to <8 x i1> 2383 %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1 2384 ret <8 x double> %3 2385} 2386 2387declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) 2388 2389define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) { 2390; X86-LABEL: test_mm512_mask2_permutex2var_ps: 2391; X86: # %bb.0: # %entry 2392; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2393; X86-NEXT: kmovw %eax, %k1 2394; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} 2395; X86-NEXT: vmovaps %zmm1, %zmm0 2396; 
X86-NEXT: retl 2397; 2398; X64-LABEL: test_mm512_mask2_permutex2var_ps: 2399; X64: # %bb.0: # %entry 2400; X64-NEXT: kmovw %edi, %k1 2401; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} 2402; X64-NEXT: vmovaps %zmm1, %zmm0 2403; X64-NEXT: retq 2404entry: 2405 %0 = bitcast <8 x i64> %__I to <16 x i32> 2406 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2407 %2 = bitcast <8 x i64> %__I to <16 x float> 2408 %3 = bitcast i16 %__U to <16 x i1> 2409 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 2410 ret <16 x float> %4 2411} 2412 2413declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) 2414 2415define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) { 2416; X86-LABEL: test_mm512_mask2_permutex2var_epi64: 2417; X86: # %bb.0: # %entry 2418; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2419; X86-NEXT: kmovw %eax, %k1 2420; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} 2421; X86-NEXT: vmovdqa64 %zmm1, %zmm0 2422; X86-NEXT: retl 2423; 2424; X64-LABEL: test_mm512_mask2_permutex2var_epi64: 2425; X64: # %bb.0: # %entry 2426; X64-NEXT: kmovw %edi, %k1 2427; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} 2428; X64-NEXT: vmovdqa64 %zmm1, %zmm0 2429; X64-NEXT: retq 2430entry: 2431 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2432 %1 = bitcast i8 %__U to <8 x i1> 2433 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I 2434 ret <8 x i64> %2 2435} 2436 2437define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2438; CHECK-LABEL: test_mm512_permutex2var_epi32: 2439; CHECK: # %bb.0: # %entry 2440; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 2441; CHECK-NEXT: ret{{[l|q]}} 2442entry: 2443 %0 = bitcast <8 x i64> %__A to <16 x i32> 2444 %1 = bitcast <8 x i64> %__I to <16 x i32> 2445 %2 = bitcast <8 x i64> %__B to <16 x i32> 2446 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) 2447 %4 = bitcast <16 x i32> %3 to <8 x i64> 2448 ret <8 x i64> %4 2449} 2450 2451define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2452; X86-LABEL: test_mm512_maskz_permutex2var_epi32: 2453; X86: # %bb.0: # %entry 2454; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2455; X86-NEXT: kmovw %eax, %k1 2456; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z} 2457; X86-NEXT: retl 2458; 2459; X64-LABEL: test_mm512_maskz_permutex2var_epi32: 2460; X64: # %bb.0: # %entry 2461; X64-NEXT: kmovw %edi, %k1 2462; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z} 2463; X64-NEXT: retq 2464entry: 2465 %0 = bitcast <8 x i64> %__A to <16 x i32> 2466 %1 = bitcast <8 x i64> %__I to <16 x i32> 2467 %2 = bitcast <8 x i64> %__B to <16 x i32> 2468 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) 2469 %4 = bitcast i16 %__U to <16 x i1> 2470 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 2471 %6 = bitcast <16 x i32> %5 to <8 x i64> 2472 ret <8 x i64> %6 2473} 2474 2475define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) { 2476; X86-LABEL: test_mm512_mask_permutex2var_epi32: 2477; X86: # %bb.0: # %entry 2478; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2479; X86-NEXT: kmovw %eax, %k1 2480; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} 2481; X86-NEXT: retl 
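; Hand-added note (not autogenerated): for permutex2var the backend may pick
; either the index-destructive vpermi2* form or the data-destructive vpermt2*
; form. In the plain/mask/maskz tests the result overwrites the first data
; operand, so vpermt2d is emitted directly; the mask2 variants earlier keep
; the result in the index register, which is why they used vpermi2d followed
; by a vmovdqa64 back into zmm0.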
2482; 2483; X64-LABEL: test_mm512_mask_permutex2var_epi32: 2484; X64: # %bb.0: # %entry 2485; X64-NEXT: kmovw %edi, %k1 2486; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} 2487; X64-NEXT: retq 2488entry: 2489 %0 = bitcast <8 x i64> %__A to <16 x i32> 2490 %1 = bitcast <8 x i64> %__I to <16 x i32> 2491 %2 = bitcast <8 x i64> %__B to <16 x i32> 2492 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) 2493 %4 = bitcast i16 %__U to <16 x i1> 2494 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0 2495 %6 = bitcast <16 x i32> %5 to <8 x i64> 2496 ret <8 x i64> %6 2497} 2498 2499define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) { 2500; CHECK-LABEL: test_mm512_permutex2var_pd: 2501; CHECK: # %bb.0: # %entry 2502; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 2503; CHECK-NEXT: ret{{[l|q]}} 2504entry: 2505 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) 2506 ret <8 x double> %0 2507} 2508 2509define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) { 2510; X86-LABEL: test_mm512_mask_permutex2var_pd: 2511; X86: # %bb.0: # %entry 2512; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2513; X86-NEXT: kmovw %eax, %k1 2514; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} 2515; X86-NEXT: retl 2516; 2517; X64-LABEL: test_mm512_mask_permutex2var_pd: 2518; X64: # %bb.0: # %entry 2519; X64-NEXT: kmovw %edi, %k1 2520; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} 2521; X64-NEXT: retq 2522entry: 2523 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) 2524 %1 = bitcast i8 %__U to <8 x i1> 2525 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 2526 ret <8 x double> %2 2527} 2528 2529define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) { 2530; X86-LABEL: test_mm512_maskz_permutex2var_pd: 2531; X86: # %bb.0: # %entry 2532; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2533; X86-NEXT: kmovw %eax, %k1 2534; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z} 2535; X86-NEXT: retl 2536; 2537; X64-LABEL: test_mm512_maskz_permutex2var_pd: 2538; X64: # %bb.0: # %entry 2539; X64-NEXT: kmovw %edi, %k1 2540; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z} 2541; X64-NEXT: retq 2542entry: 2543 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) 2544 %1 = bitcast i8 %__U to <8 x i1> 2545 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 2546 ret <8 x double> %2 2547} 2548 2549define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) { 2550; CHECK-LABEL: test_mm512_permutex2var_ps: 2551; CHECK: # %bb.0: # %entry 2552; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 2553; CHECK-NEXT: ret{{[l|q]}} 2554entry: 2555 %0 = bitcast <8 x i64> %__I to <16 x i32> 2556 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2557 ret <16 x float> %1 2558} 2559 2560define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) { 2561; X86-LABEL: test_mm512_mask_permutex2var_ps: 2562; X86: # %bb.0: # %entry 2563; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2564; X86-NEXT: kmovw %eax, %k1 2565; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} 2566; 
X86-NEXT: retl 2567; 2568; X64-LABEL: test_mm512_mask_permutex2var_ps: 2569; X64: # %bb.0: # %entry 2570; X64-NEXT: kmovw %edi, %k1 2571; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} 2572; X64-NEXT: retq 2573entry: 2574 %0 = bitcast <8 x i64> %__I to <16 x i32> 2575 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2576 %2 = bitcast i16 %__U to <16 x i1> 2577 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A 2578 ret <16 x float> %3 2579} 2580 2581define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) { 2582; X86-LABEL: test_mm512_maskz_permutex2var_ps: 2583; X86: # %bb.0: # %entry 2584; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2585; X86-NEXT: kmovw %eax, %k1 2586; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z} 2587; X86-NEXT: retl 2588; 2589; X64-LABEL: test_mm512_maskz_permutex2var_ps: 2590; X64: # %bb.0: # %entry 2591; X64-NEXT: kmovw %edi, %k1 2592; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z} 2593; X64-NEXT: retq 2594entry: 2595 %0 = bitcast <8 x i64> %__I to <16 x i32> 2596 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2597 %2 = bitcast i16 %__U to <16 x i1> 2598 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer 2599 ret <16 x float> %3 2600} 2601 2602define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2603; CHECK-LABEL: test_mm512_permutex2var_epi64: 2604; CHECK: # %bb.0: # %entry 2605; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 2606; CHECK-NEXT: ret{{[l|q]}} 2607entry: 2608 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2609 ret <8 x i64> %0 2610} 2611 2612define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) { 2613; X86-LABEL: test_mm512_mask_permutex2var_epi64: 2614; X86: # %bb.0: # %entry 2615; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2616; X86-NEXT: kmovw %eax, %k1 2617; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} 2618; X86-NEXT: retl 2619; 2620; X64-LABEL: test_mm512_mask_permutex2var_epi64: 2621; X64: # %bb.0: # %entry 2622; X64-NEXT: kmovw %edi, %k1 2623; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} 2624; X64-NEXT: retq 2625entry: 2626 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2627 %1 = bitcast i8 %__U to <8 x i1> 2628 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A 2629 ret <8 x i64> %2 2630} 2631 2632define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2633; X86-LABEL: test_mm512_maskz_permutex2var_epi64: 2634; X86: # %bb.0: # %entry 2635; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2636; X86-NEXT: kmovw %eax, %k1 2637; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z} 2638; X86-NEXT: retl 2639; 2640; X64-LABEL: test_mm512_maskz_permutex2var_epi64: 2641; X64: # %bb.0: # %entry 2642; X64-NEXT: kmovw %edi, %k1 2643; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z} 2644; X64-NEXT: retq 2645entry: 2646 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2647 %1 = bitcast i8 %__U to <8 x i1> 2648 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 2649 ret <8 x i64> %2 2650} 2651define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x 
float> %__B) { 2652; X86-LABEL: test_mm_mask_add_ss: 2653; X86: # %bb.0: # %entry 2654; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2655; X86-NEXT: kmovw %eax, %k1 2656; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1} 2657; X86-NEXT: retl 2658; 2659; X64-LABEL: test_mm_mask_add_ss: 2660; X64: # %bb.0: # %entry 2661; X64-NEXT: kmovw %edi, %k1 2662; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1} 2663; X64-NEXT: retq 2664entry: 2665 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2666 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2667 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i 2668 %0 = and i8 %__U, 1 2669 %tobool.i = icmp eq i8 %0, 0 2670 %vecext1.i = extractelement <4 x float> %__W, i32 0 2671 %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i 2672 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2673 ret <4 x float> %vecins.i 2674} 2675 2676define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2677; X86-LABEL: test_mm_maskz_add_ss: 2678; X86: # %bb.0: # %entry 2679; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2680; X86-NEXT: kmovw %eax, %k1 2681; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} 2682; X86-NEXT: retl 2683; 2684; X64-LABEL: test_mm_maskz_add_ss: 2685; X64: # %bb.0: # %entry 2686; X64-NEXT: kmovw %edi, %k1 2687; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} 2688; X64-NEXT: retq 2689entry: 2690 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2691 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2692 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i 2693 %0 = and i8 %__U, 1 2694 %tobool.i = icmp eq i8 %0, 0 2695 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i 2696 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2697 ret <4 x float> %vecins.i 2698} 2699 2700define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2701; X86-LABEL: test_mm_mask_add_sd: 2702; X86: # %bb.0: # %entry 2703; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2704; X86-NEXT: kmovw %eax, %k1 2705; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1} 2706; X86-NEXT: retl 2707; 2708; X64-LABEL: test_mm_mask_add_sd: 2709; X64: # %bb.0: # %entry 2710; X64-NEXT: kmovw %edi, %k1 2711; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1} 2712; X64-NEXT: retq 2713entry: 2714 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2715 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2716 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i 2717 %0 = and i8 %__U, 1 2718 %tobool.i = icmp eq i8 %0, 0 2719 %vecext1.i = extractelement <2 x double> %__W, i32 0 2720 %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i 2721 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2722 ret <2 x double> %vecins.i 2723} 2724 2725define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2726; X86-LABEL: test_mm_maskz_add_sd: 2727; X86: # %bb.0: # %entry 2728; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2729; X86-NEXT: kmovw %eax, %k1 2730; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2731; X86-NEXT: retl 2732; 2733; X64-LABEL: test_mm_maskz_add_sd: 2734; X64: # %bb.0: # %entry 2735; X64-NEXT: kmovw %edi, %k1 2736; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2737; X64-NEXT: retq 2738entry: 2739 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2740 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2741 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i 2742 %0 = and i8 %__U, 1 2743 %tobool.i = icmp eq i8 %0, 0 2744 %cond.i = 
select i1 %tobool.i, double 0.000000e+00, double %add.i.i 2745 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2746 ret <2 x double> %vecins.i 2747} 2748 2749define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2750; X86-LABEL: test_mm_mask_sub_ss: 2751; X86: # %bb.0: # %entry 2752; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2753; X86-NEXT: kmovw %eax, %k1 2754; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1} 2755; X86-NEXT: retl 2756; 2757; X64-LABEL: test_mm_mask_sub_ss: 2758; X64: # %bb.0: # %entry 2759; X64-NEXT: kmovw %edi, %k1 2760; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1} 2761; X64-NEXT: retq 2762entry: 2763 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2764 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2765 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i 2766 %0 = and i8 %__U, 1 2767 %tobool.i = icmp eq i8 %0, 0 2768 %vecext1.i = extractelement <4 x float> %__W, i32 0 2769 %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i 2770 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2771 ret <4 x float> %vecins.i 2772} 2773 2774define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2775; X86-LABEL: test_mm_maskz_sub_ss: 2776; X86: # %bb.0: # %entry 2777; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2778; X86-NEXT: kmovw %eax, %k1 2779; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} 2780; X86-NEXT: retl 2781; 2782; X64-LABEL: test_mm_maskz_sub_ss: 2783; X64: # %bb.0: # %entry 2784; X64-NEXT: kmovw %edi, %k1 2785; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} 2786; X64-NEXT: retq 2787entry: 2788 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2789 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2790 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i 2791 %0 = and i8 %__U, 1 2792 %tobool.i = icmp eq i8 %0, 0 2793 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i 2794 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2795 ret <4 x float> %vecins.i 2796} 2797 2798define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2799; X86-LABEL: test_mm_mask_sub_sd: 2800; X86: # %bb.0: # %entry 2801; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2802; X86-NEXT: kmovw %eax, %k1 2803; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1} 2804; X86-NEXT: retl 2805; 2806; X64-LABEL: test_mm_mask_sub_sd: 2807; X64: # %bb.0: # %entry 2808; X64-NEXT: kmovw %edi, %k1 2809; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1} 2810; X64-NEXT: retq 2811entry: 2812 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2813 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2814 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i 2815 %0 = and i8 %__U, 1 2816 %tobool.i = icmp eq i8 %0, 0 2817 %vecext1.i = extractelement <2 x double> %__W, i32 0 2818 %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i 2819 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2820 ret <2 x double> %vecins.i 2821} 2822 2823define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2824; X86-LABEL: test_mm_maskz_sub_sd: 2825; X86: # %bb.0: # %entry 2826; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2827; X86-NEXT: kmovw %eax, %k1 2828; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2829; X86-NEXT: retl 2830; 2831; X64-LABEL: test_mm_maskz_sub_sd: 2832; X64: # %bb.0: # %entry 2833; X64-NEXT: kmovw %edi, %k1 2834; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2835; 
X64-NEXT: retq 2836entry: 2837 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2838 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2839 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i 2840 %0 = and i8 %__U, 1 2841 %tobool.i = icmp eq i8 %0, 0 2842 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i 2843 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2844 ret <2 x double> %vecins.i 2845} 2846 2847define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2848; X86-LABEL: test_mm_mask_mul_ss: 2849; X86: # %bb.0: # %entry 2850; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2851; X86-NEXT: kmovw %eax, %k1 2852; X86-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1} 2853; X86-NEXT: retl 2854; 2855; X64-LABEL: test_mm_mask_mul_ss: 2856; X64: # %bb.0: # %entry 2857; X64-NEXT: kmovw %edi, %k1 2858; X64-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1} 2859; X64-NEXT: retq 2860entry: 2861 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2862 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2863 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i 2864 %0 = and i8 %__U, 1 2865 %tobool.i = icmp eq i8 %0, 0 2866 %vecext1.i = extractelement <4 x float> %__W, i32 0 2867 %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i 2868 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2869 ret <4 x float> %vecins.i 2870} 2871 2872define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2873; X86-LABEL: test_mm_maskz_mul_ss: 2874; X86: # %bb.0: # %entry 2875; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2876; X86-NEXT: kmovw %eax, %k1 2877; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} 2878; X86-NEXT: retl 2879; 2880; X64-LABEL: test_mm_maskz_mul_ss: 2881; X64: # %bb.0: # %entry 2882; X64-NEXT: kmovw %edi, %k1 2883; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} 2884; X64-NEXT: retq 2885entry: 2886 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2887 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2888 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i 2889 %0 = and i8 %__U, 1 2890 %tobool.i = icmp eq i8 %0, 0 2891 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i 2892 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2893 ret <4 x float> %vecins.i 2894} 2895 2896define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2897; X86-LABEL: test_mm_mask_mul_sd: 2898; X86: # %bb.0: # %entry 2899; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2900; X86-NEXT: kmovw %eax, %k1 2901; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1} 2902; X86-NEXT: retl 2903; 2904; X64-LABEL: test_mm_mask_mul_sd: 2905; X64: # %bb.0: # %entry 2906; X64-NEXT: kmovw %edi, %k1 2907; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1} 2908; X64-NEXT: retq 2909entry: 2910 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2911 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2912 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i 2913 %0 = and i8 %__U, 1 2914 %tobool.i = icmp eq i8 %0, 0 2915 %vecext1.i = extractelement <2 x double> %__W, i32 0 2916 %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i 2917 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2918 ret <2 x double> %vecins.i 2919} 2920 2921define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2922; X86-LABEL: test_mm_maskz_mul_sd: 2923; X86: # %bb.0: # %entry 2924; X86-NEXT: movb {{[0-9]+}}(%esp), 
%al 2925; X86-NEXT: kmovw %eax, %k1 2926; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2927; X86-NEXT: retl 2928; 2929; X64-LABEL: test_mm_maskz_mul_sd: 2930; X64: # %bb.0: # %entry 2931; X64-NEXT: kmovw %edi, %k1 2932; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2933; X64-NEXT: retq 2934entry: 2935 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2936 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2937 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i 2938 %0 = and i8 %__U, 1 2939 %tobool.i = icmp eq i8 %0, 0 2940 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i 2941 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2942 ret <2 x double> %vecins.i 2943} 2944 2945define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2946; X86-LABEL: test_mm_mask_div_ss: 2947; X86: # %bb.0: # %entry 2948; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2949; X86-NEXT: kmovw %eax, %k1 2950; X86-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1} 2951; X86-NEXT: retl 2952; 2953; X64-LABEL: test_mm_mask_div_ss: 2954; X64: # %bb.0: # %entry 2955; X64-NEXT: kmovw %edi, %k1 2956; X64-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1} 2957; X64-NEXT: retq 2958entry: 2959 %0 = extractelement <4 x float> %__A, i64 0 2960 %1 = extractelement <4 x float> %__B, i64 0 2961 %2 = extractelement <4 x float> %__W, i64 0 2962 %3 = fdiv float %0, %1 2963 %4 = bitcast i8 %__U to <8 x i1> 2964 %5 = extractelement <8 x i1> %4, i64 0 2965 %6 = select i1 %5, float %3, float %2 2966 %7 = insertelement <4 x float> %__A, float %6, i64 0 2967 ret <4 x float> %7 2968} 2969 2970define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2971; X86-LABEL: test_mm_maskz_div_ss: 2972; X86: # %bb.0: # %entry 2973; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2974; X86-NEXT: kmovw %eax, %k1 2975; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} 2976; X86-NEXT: retl 2977; 2978; X64-LABEL: test_mm_maskz_div_ss: 2979; X64: # %bb.0: # %entry 2980; X64-NEXT: kmovw %edi, %k1 2981; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} 2982; X64-NEXT: retq 2983entry: 2984 %0 = extractelement <4 x float> %__A, i64 0 2985 %1 = extractelement <4 x float> %__B, i64 0 2986 %2 = fdiv float %0, %1 2987 %3 = bitcast i8 %__U to <8 x i1> 2988 %4 = extractelement <8 x i1> %3, i64 0 2989 %5 = select i1 %4, float %2, float 0.000000e+00 2990 %6 = insertelement <4 x float> %__A, float %5, i64 0 2991 ret <4 x float> %6 2992} 2993 2994define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2995; X86-LABEL: test_mm_mask_div_sd: 2996; X86: # %bb.0: # %entry 2997; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2998; X86-NEXT: kmovw %eax, %k1 2999; X86-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1} 3000; X86-NEXT: retl 3001; 3002; X64-LABEL: test_mm_mask_div_sd: 3003; X64: # %bb.0: # %entry 3004; X64-NEXT: kmovw %edi, %k1 3005; X64-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1} 3006; X64-NEXT: retq 3007entry: 3008 %0 = extractelement <2 x double> %__A, i64 0 3009 %1 = extractelement <2 x double> %__B, i64 0 3010 %2 = extractelement <2 x double> %__W, i64 0 3011 %3 = fdiv double %0, %1 3012 %4 = bitcast i8 %__U to <8 x i1> 3013 %5 = extractelement <8 x i1> %4, i64 0 3014 %6 = select i1 %5, double %3, double %2 3015 %7 = insertelement <2 x double> %__A, double %6, i64 0 3016 ret <2 x double> %7 3017} 3018 3019define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 3020; X86-LABEL: test_mm_maskz_div_sd: 
3021; X86: # %bb.0: # %entry 3022; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3023; X86-NEXT: kmovw %eax, %k1 3024; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} 3025; X86-NEXT: retl 3026; 3027; X64-LABEL: test_mm_maskz_div_sd: 3028; X64: # %bb.0: # %entry 3029; X64-NEXT: kmovw %edi, %k1 3030; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} 3031; X64-NEXT: retq 3032entry: 3033 %0 = extractelement <2 x double> %__A, i64 0 3034 %1 = extractelement <2 x double> %__B, i64 0 3035 %2 = fdiv double %0, %1 3036 %3 = bitcast i8 %__U to <8 x i1> 3037 %4 = extractelement <8 x i1> %3, i64 0 3038 %5 = select i1 %4, double %2, double 0.000000e+00 3039 %6 = insertelement <2 x double> %__A, double %5, i64 0 3040 ret <2 x double> %6 3041} 3042 3043 3044define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3045; CHECK-LABEL: test_mm512_fmadd_round_pd: 3046; CHECK: # %bb.0: # %entry 3047; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3048; CHECK-NEXT: ret{{[l|q]}} 3049entry: 3050 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3051 ret <8 x double> %0 3052} 3053 3054declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1 3055 3056define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3057; X86-LABEL: test_mm512_mask_fmadd_round_pd: 3058; X86: # %bb.0: # %entry 3059; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3060; X86-NEXT: kmovw %eax, %k1 3061; X86-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3062; X86-NEXT: retl 3063; 3064; X64-LABEL: test_mm512_mask_fmadd_round_pd: 3065; X64: # %bb.0: # %entry 3066; X64-NEXT: kmovw %edi, %k1 3067; X64-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3068; X64-NEXT: retq 3069entry: 3070 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3071 %1 = bitcast i8 %__U to <8 x i1> 3072 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3073 ret <8 x double> %2 3074} 3075 3076define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3077; X86-LABEL: test_mm512_mask3_fmadd_round_pd: 3078; X86: # %bb.0: # %entry 3079; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3080; X86-NEXT: kmovw %eax, %k1 3081; X86-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3082; X86-NEXT: vmovapd %zmm2, %zmm0 3083; X86-NEXT: retl 3084; 3085; X64-LABEL: test_mm512_mask3_fmadd_round_pd: 3086; X64: # %bb.0: # %entry 3087; X64-NEXT: kmovw %edi, %k1 3088; X64-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3089; X64-NEXT: vmovapd %zmm2, %zmm0 3090; X64-NEXT: retq 3091entry: 3092 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3093 %1 = bitcast i8 %__U to <8 x i1> 3094 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3095 ret <8 x double> %2 3096} 3097 3098define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3099; X86-LABEL: test_mm512_maskz_fmadd_round_pd: 3100; X86: # %bb.0: # %entry 3101; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3102; X86-NEXT: kmovw %eax, %k1 3103; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3104; X86-NEXT: retl 3105; 3106; X64-LABEL: test_mm512_maskz_fmadd_round_pd: 3107; X64: # %bb.0: # %entry 3108; X64-NEXT: 
kmovw %edi, %k1 3109; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3110; X64-NEXT: retq 3111entry: 3112 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3113 %1 = bitcast i8 %__U to <8 x i1> 3114 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3115 ret <8 x double> %2 3116} 3117 3118define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3119; X86-LABEL: test_mm512_fmsub_round_pd: 3120; X86: # %bb.0: # %entry 3121; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2 3122; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3123; X86-NEXT: retl 3124; 3125; X64-LABEL: test_mm512_fmsub_round_pd: 3126; X64: # %bb.0: # %entry 3127; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3128; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3129; X64-NEXT: retq 3130entry: 3131 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3132 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3133 ret <8 x double> %0 3134} 3135 3136define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3137; X86-LABEL: test_mm512_mask_fmsub_round_pd: 3138; X86: # %bb.0: # %entry 3139; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3140; X86-NEXT: kmovw %eax, %k1 3141; X86-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3142; X86-NEXT: retl 3143; 3144; X64-LABEL: test_mm512_mask_fmsub_round_pd: 3145; X64: # %bb.0: # %entry 3146; X64-NEXT: kmovw %edi, %k1 3147; X64-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3148; X64-NEXT: retq 3149entry: 3150 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3151 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3152 %1 = bitcast i8 %__U to <8 x i1> 3153 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3154 ret <8 x double> %2 3155} 3156 3157define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3158; X86-LABEL: test_mm512_maskz_fmsub_round_pd: 3159; X86: # %bb.0: # %entry 3160; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3161; X86-NEXT: kmovw %eax, %k1 3162; X86-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3163; X86-NEXT: retl 3164; 3165; X64-LABEL: test_mm512_maskz_fmsub_round_pd: 3166; X64: # %bb.0: # %entry 3167; X64-NEXT: kmovw %edi, %k1 3168; X64-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3169; X64-NEXT: retq 3170entry: 3171 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3172 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3173 %1 = bitcast i8 %__U to <8 x i1> 3174 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3175 ret <8 x double> %2 3176} 3177 3178define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x 
double> %__C) { 3179; X86-LABEL: test_mm512_fnmadd_round_pd: 3180; X86: # %bb.0: # %entry 3181; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0 3182; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3183; X86-NEXT: retl 3184; 3185; X64-LABEL: test_mm512_fnmadd_round_pd: 3186; X64: # %bb.0: # %entry 3187; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0 3188; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3189; X64-NEXT: retq 3190entry: 3191 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3192 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 3193 ret <8 x double> %0 3194} 3195 3196define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3197; X86-LABEL: test_mm512_mask3_fnmadd_round_pd: 3198; X86: # %bb.0: # %entry 3199; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3200; X86-NEXT: kmovw %eax, %k1 3201; X86-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3202; X86-NEXT: vmovapd %zmm2, %zmm0 3203; X86-NEXT: retl 3204; 3205; X64-LABEL: test_mm512_mask3_fnmadd_round_pd: 3206; X64: # %bb.0: # %entry 3207; X64-NEXT: kmovw %edi, %k1 3208; X64-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3209; X64-NEXT: vmovapd %zmm2, %zmm0 3210; X64-NEXT: retq 3211entry: 3212 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3213 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 3214 %1 = bitcast i8 %__U to <8 x i1> 3215 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3216 ret <8 x double> %2 3217} 3218 3219define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3220; X86-LABEL: test_mm512_maskz_fnmadd_round_pd: 3221; X86: # %bb.0: # %entry 3222; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3223; X86-NEXT: kmovw %eax, %k1 3224; X86-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3225; X86-NEXT: retl 3226; 3227; X64-LABEL: test_mm512_maskz_fnmadd_round_pd: 3228; X64: # %bb.0: # %entry 3229; X64-NEXT: kmovw %edi, %k1 3230; X64-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3231; X64-NEXT: retq 3232entry: 3233 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3234 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 3235 %1 = bitcast i8 %__U to <8 x i1> 3236 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3237 ret <8 x double> %2 3238} 3239 3240define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3241; CHECK-LABEL: test_mm512_fnmsub_round_pd: 3242; CHECK: # %bb.0: # %entry 3243; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 3244; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 3245; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 3246; CHECK-NEXT: vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0 3247; CHECK-NEXT: ret{{[l|q]}} 3248entry: 
3249 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3250 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3251 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8) 3252 ret <8 x double> %0 3253} 3254 3255define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3256; X86-LABEL: test_mm512_maskz_fnmsub_round_pd: 3257; X86: # %bb.0: # %entry 3258; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3259; X86-NEXT: kmovw %eax, %k1 3260; X86-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3261; X86-NEXT: retl 3262; 3263; X64-LABEL: test_mm512_maskz_fnmsub_round_pd: 3264; X64: # %bb.0: # %entry 3265; X64-NEXT: kmovw %edi, %k1 3266; X64-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3267; X64-NEXT: retq 3268entry: 3269 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3270 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3271 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8) 3272 %1 = bitcast i8 %__U to <8 x i1> 3273 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3274 ret <8 x double> %2 3275} 3276 3277define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3278; CHECK-LABEL: test_mm512_fmadd_pd: 3279; CHECK: # %bb.0: # %entry 3280; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3281; CHECK-NEXT: ret{{[l|q]}} 3282entry: 3283 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3284 ret <8 x double> %0 3285} 3286 3287define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3288; X86-LABEL: test_mm512_mask_fmadd_pd: 3289; X86: # %bb.0: # %entry 3290; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3291; X86-NEXT: kmovw %eax, %k1 3292; X86-NEXT: vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2 3293; X86-NEXT: retl 3294; 3295; X64-LABEL: test_mm512_mask_fmadd_pd: 3296; X64: # %bb.0: # %entry 3297; X64-NEXT: kmovw %edi, %k1 3298; X64-NEXT: vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2 3299; X64-NEXT: retq 3300entry: 3301 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3302 %1 = bitcast i8 %__U to <8 x i1> 3303 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3304 ret <8 x double> %2 3305} 3306 3307define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3308; X86-LABEL: test_mm512_mask3_fmadd_pd: 3309; X86: # %bb.0: # %entry 3310; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3311; X86-NEXT: kmovw %eax, %k1 3312; X86-NEXT: vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2 3313; X86-NEXT: vmovapd %zmm2, %zmm0 3314; X86-NEXT: 
retl 3315; 3316; X64-LABEL: test_mm512_mask3_fmadd_pd: 3317; X64: # %bb.0: # %entry 3318; X64-NEXT: kmovw %edi, %k1 3319; X64-NEXT: vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2 3320; X64-NEXT: vmovapd %zmm2, %zmm0 3321; X64-NEXT: retq 3322entry: 3323 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3324 %1 = bitcast i8 %__U to <8 x i1> 3325 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3326 ret <8 x double> %2 3327} 3328 3329define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3330; X86-LABEL: test_mm512_maskz_fmadd_pd: 3331; X86: # %bb.0: # %entry 3332; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3333; X86-NEXT: kmovw %eax, %k1 3334; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2 3335; X86-NEXT: retl 3336; 3337; X64-LABEL: test_mm512_maskz_fmadd_pd: 3338; X64: # %bb.0: # %entry 3339; X64-NEXT: kmovw %edi, %k1 3340; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2 3341; X64-NEXT: retq 3342entry: 3343 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3344 %1 = bitcast i8 %__U to <8 x i1> 3345 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3346 ret <8 x double> %2 3347} 3348 3349define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3350; X86-LABEL: test_mm512_fmsub_pd: 3351; X86: # %bb.0: # %entry 3352; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2 3353; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3354; X86-NEXT: retl 3355; 3356; X64-LABEL: test_mm512_fmsub_pd: 3357; X64: # %bb.0: # %entry 3358; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3359; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3360; X64-NEXT: retq 3361entry: 3362 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3363 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 3364 ret <8 x double> %0 3365} 3366 3367define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3368; X86-LABEL: test_mm512_mask_fmsub_pd: 3369; X86: # %bb.0: # %entry 3370; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3371; X86-NEXT: kmovw %eax, %k1 3372; X86-NEXT: vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2 3373; X86-NEXT: retl 3374; 3375; X64-LABEL: test_mm512_mask_fmsub_pd: 3376; X64: # %bb.0: # %entry 3377; X64-NEXT: kmovw %edi, %k1 3378; X64-NEXT: vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2 3379; X64-NEXT: retq 3380entry: 3381 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3382 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 3383 %1 = bitcast i8 %__U to <8 x i1> 3384 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3385 ret <8 x double> %2 3386} 3387 3388define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3389; X86-LABEL: test_mm512_maskz_fmsub_pd: 3390; X86: # %bb.0: # %entry 3391; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3392; 
X86-NEXT: kmovw %eax, %k1 3393; X86-NEXT: vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2 3394; X86-NEXT: retl 3395; 3396; X64-LABEL: test_mm512_maskz_fmsub_pd: 3397; X64: # %bb.0: # %entry 3398; X64-NEXT: kmovw %edi, %k1 3399; X64-NEXT: vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2 3400; X64-NEXT: retq 3401entry: 3402 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3403 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 3404 %1 = bitcast i8 %__U to <8 x i1> 3405 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3406 ret <8 x double> %2 3407} 3408 3409define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3410; X86-LABEL: test_mm512_fnmadd_pd: 3411; X86: # %bb.0: # %entry 3412; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0 3413; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3414; X86-NEXT: retl 3415; 3416; X64-LABEL: test_mm512_fnmadd_pd: 3417; X64: # %bb.0: # %entry 3418; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0 3419; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3420; X64-NEXT: retq 3421entry: 3422 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3423 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 3424 ret <8 x double> %0 3425} 3426 3427define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3428; X86-LABEL: test_mm512_mask3_fnmadd_pd: 3429; X86: # %bb.0: # %entry 3430; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3431; X86-NEXT: kmovw %eax, %k1 3432; X86-NEXT: vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2 3433; X86-NEXT: vmovapd %zmm2, %zmm0 3434; X86-NEXT: retl 3435; 3436; X64-LABEL: test_mm512_mask3_fnmadd_pd: 3437; X64: # %bb.0: # %entry 3438; X64-NEXT: kmovw %edi, %k1 3439; X64-NEXT: vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2 3440; X64-NEXT: vmovapd %zmm2, %zmm0 3441; X64-NEXT: retq 3442entry: 3443 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3444 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 3445 %1 = bitcast i8 %__U to <8 x i1> 3446 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3447 ret <8 x double> %2 3448} 3449 3450define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3451; X86-LABEL: test_mm512_maskz_fnmadd_pd: 3452; X86: # %bb.0: # %entry 3453; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3454; X86-NEXT: kmovw %eax, %k1 3455; X86-NEXT: vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2 3456; X86-NEXT: retl 3457; 3458; X64-LABEL: test_mm512_maskz_fnmadd_pd: 3459; X64: # %bb.0: # %entry 3460; X64-NEXT: kmovw %edi, %k1 3461; X64-NEXT: vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2 3462; X64-NEXT: retq 3463entry: 3464 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3465 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 3466 %1 = bitcast i8 %__U to <8 x i1> 3467 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3468 ret <8 x double> %2 3469} 3470 3471define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3472; CHECK-LABEL: test_mm512_fnmsub_pd: 3473; CHECK: # %bb.0: # %entry 3474; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 3475; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 3476; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 3477; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0 3478; CHECK-NEXT: ret{{[l|q]}} 3479entry: 3480 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3481 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3482 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10 3483 ret <8 x double> %0 3484} 3485 3486define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3487; X86-LABEL: test_mm512_maskz_fnmsub_pd: 3488; X86: # %bb.0: # %entry 3489; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3490; X86-NEXT: kmovw %eax, %k1 3491; X86-NEXT: vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2 3492; X86-NEXT: retl 3493; 3494; X64-LABEL: test_mm512_maskz_fnmsub_pd: 3495; X64: # %bb.0: # %entry 3496; X64-NEXT: kmovw %edi, %k1 3497; X64-NEXT: vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2 3498; X64-NEXT: retq 3499entry: 3500 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3501 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3502 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10 3503 %1 = bitcast i8 %__U to <8 x i1> 3504 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3505 ret <8 x double> %2 3506} 3507 3508define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3509; CHECK-LABEL: test_mm512_fmadd_round_ps: 3510; CHECK: # %bb.0: # %entry 3511; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3512; CHECK-NEXT: ret{{[l|q]}} 3513entry: 3514 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3515 ret <16 x float> %0 3516} 3517 3518declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1 3519 3520define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3521; X86-LABEL: test_mm512_mask_fmadd_round_ps: 3522; X86: # %bb.0: # %entry 3523; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 
3524; X86-NEXT: kmovw %eax, %k1 3525; X86-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3526; X86-NEXT: retl 3527; 3528; X64-LABEL: test_mm512_mask_fmadd_round_ps: 3529; X64: # %bb.0: # %entry 3530; X64-NEXT: kmovw %edi, %k1 3531; X64-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3532; X64-NEXT: retq 3533entry: 3534 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3535 %1 = bitcast i16 %__U to <16 x i1> 3536 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3537 ret <16 x float> %2 3538} 3539 3540define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3541; X86-LABEL: test_mm512_mask3_fmadd_round_ps: 3542; X86: # %bb.0: # %entry 3543; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3544; X86-NEXT: kmovw %eax, %k1 3545; X86-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3546; X86-NEXT: vmovaps %zmm2, %zmm0 3547; X86-NEXT: retl 3548; 3549; X64-LABEL: test_mm512_mask3_fmadd_round_ps: 3550; X64: # %bb.0: # %entry 3551; X64-NEXT: kmovw %edi, %k1 3552; X64-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3553; X64-NEXT: vmovaps %zmm2, %zmm0 3554; X64-NEXT: retq 3555entry: 3556 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3557 %1 = bitcast i16 %__U to <16 x i1> 3558 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3559 ret <16 x float> %2 3560} 3561 3562define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3563; X86-LABEL: test_mm512_maskz_fmadd_round_ps: 3564; X86: # %bb.0: # %entry 3565; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3566; X86-NEXT: kmovw %eax, %k1 3567; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3568; X86-NEXT: retl 3569; 3570; X64-LABEL: test_mm512_maskz_fmadd_round_ps: 3571; X64: # %bb.0: # %entry 3572; X64-NEXT: kmovw %edi, %k1 3573; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3574; X64-NEXT: retq 3575entry: 3576 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3577 %1 = bitcast i16 %__U to <16 x i1> 3578 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3579 ret <16 x float> %2 3580} 3581 3582define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3583; X86-LABEL: test_mm512_fmsub_round_ps: 3584; X86: # %bb.0: # %entry 3585; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2 3586; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3587; X86-NEXT: retl 3588; 3589; X64-LABEL: test_mm512_fmsub_round_ps: 3590; X64: # %bb.0: # %entry 3591; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2 3592; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3593; X64-NEXT: retq 3594entry: 3595 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3596 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 3597 ret <16 x float> %0 3598} 3599 3600define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x 
float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3601; X86-LABEL: test_mm512_mask_fmsub_round_ps: 3602; X86: # %bb.0: # %entry 3603; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3604; X86-NEXT: kmovw %eax, %k1 3605; X86-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3606; X86-NEXT: retl 3607; 3608; X64-LABEL: test_mm512_mask_fmsub_round_ps: 3609; X64: # %bb.0: # %entry 3610; X64-NEXT: kmovw %edi, %k1 3611; X64-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3612; X64-NEXT: retq 3613entry: 3614 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3615 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 3616 %1 = bitcast i16 %__U to <16 x i1> 3617 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3618 ret <16 x float> %2 3619} 3620 3621define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3622; X86-LABEL: test_mm512_maskz_fmsub_round_ps: 3623; X86: # %bb.0: # %entry 3624; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3625; X86-NEXT: kmovw %eax, %k1 3626; X86-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3627; X86-NEXT: retl 3628; 3629; X64-LABEL: test_mm512_maskz_fmsub_round_ps: 3630; X64: # %bb.0: # %entry 3631; X64-NEXT: kmovw %edi, %k1 3632; X64-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3633; X64-NEXT: retq 3634entry: 3635 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3636 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 3637 %1 = bitcast i16 %__U to <16 x i1> 3638 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3639 ret <16 x float> %2 3640} 3641 3642define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3643; X86-LABEL: test_mm512_fnmadd_round_ps: 3644; X86: # %bb.0: # %entry 3645; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 3646; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3647; X86-NEXT: retl 3648; 3649; X64-LABEL: test_mm512_fnmadd_round_ps: 3650; X64: # %bb.0: # %entry 3651; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 3652; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3653; X64-NEXT: retq 3654entry: 3655 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3656 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8) 3657 ret <16 x float> %0 3658} 3659 3660define <16 x float> 
@test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3661; X86-LABEL: test_mm512_mask3_fnmadd_round_ps: 3662; X86: # %bb.0: # %entry 3663; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3664; X86-NEXT: kmovw %eax, %k1 3665; X86-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3666; X86-NEXT: vmovaps %zmm2, %zmm0 3667; X86-NEXT: retl 3668; 3669; X64-LABEL: test_mm512_mask3_fnmadd_round_ps: 3670; X64: # %bb.0: # %entry 3671; X64-NEXT: kmovw %edi, %k1 3672; X64-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3673; X64-NEXT: vmovaps %zmm2, %zmm0 3674; X64-NEXT: retq 3675entry: 3676 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3677 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8) 3678 %1 = bitcast i16 %__U to <16 x i1> 3679 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3680 ret <16 x float> %2 3681} 3682 3683define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3684; X86-LABEL: test_mm512_maskz_fnmadd_round_ps: 3685; X86: # %bb.0: # %entry 3686; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3687; X86-NEXT: kmovw %eax, %k1 3688; X86-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3689; X86-NEXT: retl 3690; 3691; X64-LABEL: test_mm512_maskz_fnmadd_round_ps: 3692; X64: # %bb.0: # %entry 3693; X64-NEXT: kmovw %edi, %k1 3694; X64-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3695; X64-NEXT: retq 3696entry: 3697 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3698 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8) 3699 %1 = bitcast i16 %__U to <16 x i1> 3700 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3701 ret <16 x float> %2 3702} 3703 3704define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3705; CHECK-LABEL: test_mm512_fnmsub_round_ps: 3706; CHECK: # %bb.0: # %entry 3707; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 3708; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4 3709; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0 3710; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0 3711; CHECK-NEXT: ret{{[l|q]}} 3712entry: 3713 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3714 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3715 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8) 3716 ret <16 x float> %0 3717} 3718 3719define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3720; X86-LABEL: test_mm512_maskz_fnmsub_round_ps: 3721; X86: # %bb.0: # %entry 3722; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3723; X86-NEXT: kmovw %eax, %k1 3724; X86-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3725; X86-NEXT: retl 3726; 3727; X64-LABEL: test_mm512_maskz_fnmsub_round_ps: 3728; X64: # %bb.0: # %entry 3729; X64-NEXT: kmovw %edi, %k1 3730; X64-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3731; X64-NEXT: retq 3732entry: 3733 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3734 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3735 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8) 3736 %1 = bitcast i16 %__U to <16 x i1> 3737 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3738 ret <16 x float> %2 3739} 3740 3741define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3742; CHECK-LABEL: test_mm512_fmadd_ps: 3743; CHECK: # %bb.0: # %entry 3744; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3745; CHECK-NEXT: ret{{[l|q]}} 3746entry: 3747 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3748 ret <16 x float> %0 3749} 3750 3751define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3752; X86-LABEL: test_mm512_mask_fmadd_ps: 3753; X86: # %bb.0: # %entry 3754; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3755; X86-NEXT: kmovw %eax, %k1 3756; X86-NEXT: vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2 3757; X86-NEXT: retl 3758; 3759; X64-LABEL: test_mm512_mask_fmadd_ps: 3760; X64: # %bb.0: # %entry 3761; X64-NEXT: kmovw %edi, %k1 3762; X64-NEXT: vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2 3763; X64-NEXT: retq 3764entry: 3765 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3766 %1 = bitcast i16 %__U to <16 x i1> 3767 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3768 ret <16 x float> %2 3769} 3770 3771define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3772; X86-LABEL: test_mm512_mask3_fmadd_ps: 3773; X86: # %bb.0: # 
%entry 3774; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3775; X86-NEXT: kmovw %eax, %k1 3776; X86-NEXT: vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2 3777; X86-NEXT: vmovaps %zmm2, %zmm0 3778; X86-NEXT: retl 3779; 3780; X64-LABEL: test_mm512_mask3_fmadd_ps: 3781; X64: # %bb.0: # %entry 3782; X64-NEXT: kmovw %edi, %k1 3783; X64-NEXT: vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2 3784; X64-NEXT: vmovaps %zmm2, %zmm0 3785; X64-NEXT: retq 3786entry: 3787 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3788 %1 = bitcast i16 %__U to <16 x i1> 3789 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3790 ret <16 x float> %2 3791} 3792 3793define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3794; X86-LABEL: test_mm512_maskz_fmadd_ps: 3795; X86: # %bb.0: # %entry 3796; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3797; X86-NEXT: kmovw %eax, %k1 3798; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2 3799; X86-NEXT: retl 3800; 3801; X64-LABEL: test_mm512_maskz_fmadd_ps: 3802; X64: # %bb.0: # %entry 3803; X64-NEXT: kmovw %edi, %k1 3804; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2 3805; X64-NEXT: retq 3806entry: 3807 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3808 %1 = bitcast i16 %__U to <16 x i1> 3809 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3810 ret <16 x float> %2 3811} 3812 3813define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3814; X86-LABEL: test_mm512_fmsub_ps: 3815; X86: # %bb.0: # %entry 3816; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2 3817; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3818; X86-NEXT: retl 3819; 3820; X64-LABEL: test_mm512_fmsub_ps: 3821; X64: # %bb.0: # %entry 3822; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2 3823; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3824; X64-NEXT: retq 3825entry: 3826 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3827 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 3828 ret <16 x float> %0 3829} 3830 3831define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3832; X86-LABEL: test_mm512_mask_fmsub_ps: 3833; X86: # %bb.0: # %entry 3834; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3835; X86-NEXT: kmovw %eax, %k1 3836; X86-NEXT: vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2 3837; X86-NEXT: retl 3838; 3839; X64-LABEL: test_mm512_mask_fmsub_ps: 3840; X64: # %bb.0: # %entry 3841; X64-NEXT: kmovw %edi, %k1 3842; X64-NEXT: vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2 3843; X64-NEXT: retq 3844entry: 3845 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00>, %__C 3846 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 3847 %1 = bitcast i16 %__U to <16 x i1> 3848 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3849 ret <16 x float> %2 3850} 3851 3852define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3853; X86-LABEL: test_mm512_maskz_fmsub_ps: 3854; X86: # %bb.0: # %entry 3855; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3856; X86-NEXT: kmovw %eax, %k1 3857; X86-NEXT: vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2 3858; X86-NEXT: retl 3859; 3860; X64-LABEL: test_mm512_maskz_fmsub_ps: 3861; X64: # %bb.0: # %entry 3862; X64-NEXT: kmovw %edi, %k1 3863; X64-NEXT: vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2 3864; X64-NEXT: retq 3865entry: 3866 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3867 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 3868 %1 = bitcast i16 %__U to <16 x i1> 3869 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3870 ret <16 x float> %2 3871} 3872 3873define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3874; X86-LABEL: test_mm512_fnmadd_ps: 3875; X86: # %bb.0: # %entry 3876; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 3877; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3878; X86-NEXT: retl 3879; 3880; X64-LABEL: test_mm512_fnmadd_ps: 3881; X64: # %bb.0: # %entry 3882; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 3883; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3884; X64-NEXT: retq 3885entry: 3886 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3887 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 3888 ret <16 x float> %0 3889} 3890 3891define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3892; X86-LABEL: test_mm512_mask3_fnmadd_ps: 3893; X86: # %bb.0: # %entry 3894; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3895; X86-NEXT: kmovw %eax, %k1 3896; X86-NEXT: vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2 3897; X86-NEXT: vmovaps %zmm2, %zmm0 3898; X86-NEXT: retl 3899; 3900; X64-LABEL: test_mm512_mask3_fnmadd_ps: 3901; X64: # %bb.0: # %entry 3902; X64-NEXT: kmovw %edi, %k1 3903; X64-NEXT: vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2 3904; X64-NEXT: vmovaps %zmm2, %zmm0 3905; X64-NEXT: retq 3906entry: 3907 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, 
float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3908 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 3909 %1 = bitcast i16 %__U to <16 x i1> 3910 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3911 ret <16 x float> %2 3912} 3913 3914define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3915; X86-LABEL: test_mm512_maskz_fnmadd_ps: 3916; X86: # %bb.0: # %entry 3917; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3918; X86-NEXT: kmovw %eax, %k1 3919; X86-NEXT: vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2 3920; X86-NEXT: retl 3921; 3922; X64-LABEL: test_mm512_maskz_fnmadd_ps: 3923; X64: # %bb.0: # %entry 3924; X64-NEXT: kmovw %edi, %k1 3925; X64-NEXT: vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2 3926; X64-NEXT: retq 3927entry: 3928 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3929 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 3930 %1 = bitcast i16 %__U to <16 x i1> 3931 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3932 ret <16 x float> %2 3933} 3934 3935define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3936; CHECK-LABEL: test_mm512_fnmsub_ps: 3937; CHECK: # %bb.0: # %entry 3938; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 3939; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4 3940; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0 3941; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0 3942; CHECK-NEXT: ret{{[l|q]}} 3943entry: 3944 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3945 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3946 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10 3947 ret <16 x float> %0 3948} 3949 3950define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3951; X86-LABEL: test_mm512_maskz_fnmsub_ps: 3952; X86: # %bb.0: # %entry 3953; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 3954; X86-NEXT: kmovw %eax, %k1 3955; X86-NEXT: vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2 3956; X86-NEXT: retl 3957; 3958; X64-LABEL: test_mm512_maskz_fnmsub_ps: 3959; X64: # %bb.0: # %entry 3960; X64-NEXT: kmovw %edi, %k1 3961; X64-NEXT: 
vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2 3962; X64-NEXT: retq 3963entry: 3964 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3965 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3966 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10 3967 %1 = bitcast i16 %__U to <16 x i1> 3968 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3969 ret <16 x float> %2 3970} 3971 3972define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3973; CHECK-LABEL: test_mm512_fmaddsub_round_pd: 3974; CHECK: # %bb.0: # %entry 3975; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3976; CHECK-NEXT: ret{{[l|q]}} 3977entry: 3978 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3979 ret <8 x double> %0 3980} 3981 3982declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1 3983 3984define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3985; X86-LABEL: test_mm512_mask_fmaddsub_round_pd: 3986; X86: # %bb.0: # %entry 3987; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3988; X86-NEXT: kmovw %eax, %k1 3989; X86-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3990; X86-NEXT: retl 3991; 3992; X64-LABEL: test_mm512_mask_fmaddsub_round_pd: 3993; X64: # %bb.0: # %entry 3994; X64-NEXT: kmovw %edi, %k1 3995; X64-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3996; X64-NEXT: retq 3997entry: 3998 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3999 %1 = bitcast i8 %__U to <8 x i1> 4000 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4001 ret <8 x double> %2 4002} 4003 4004define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4005; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd: 4006; X86: # %bb.0: # %entry 4007; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4008; X86-NEXT: kmovw %eax, %k1 4009; X86-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4010; X86-NEXT: vmovapd %zmm2, %zmm0 4011; X86-NEXT: retl 4012; 4013; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd: 4014; X64: # %bb.0: # %entry 4015; X64-NEXT: kmovw %edi, %k1 4016; X64-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4017; X64-NEXT: vmovapd %zmm2, %zmm0 4018; X64-NEXT: retq 4019entry: 4020 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 4021 %1 = bitcast i8 %__U to <8 x i1> 4022 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4023 ret <8 x double> %2 4024} 4025 4026define <8 x double> 
@test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4027; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd: 4028; X86: # %bb.0: # %entry 4029; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4030; X86-NEXT: kmovw %eax, %k1 4031; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4032; X86-NEXT: retl 4033; 4034; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd: 4035; X64: # %bb.0: # %entry 4036; X64-NEXT: kmovw %edi, %k1 4037; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4038; X64-NEXT: retq 4039entry: 4040 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 4041 %1 = bitcast i8 %__U to <8 x i1> 4042 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 4043 ret <8 x double> %2 4044} 4045 4046define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4047; X86-LABEL: test_mm512_fmsubadd_round_pd: 4048; X86: # %bb.0: # %entry 4049; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2 4050; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 4051; X86-NEXT: retl 4052; 4053; X64-LABEL: test_mm512_fmsubadd_round_pd: 4054; X64: # %bb.0: # %entry 4055; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2 4056; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 4057; X64-NEXT: retq 4058entry: 4059 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4060 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4061 ret <8 x double> %0 4062} 4063 4064define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4065; X86-LABEL: test_mm512_mask_fmsubadd_round_pd: 4066; X86: # %bb.0: # %entry 4067; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4068; X86-NEXT: kmovw %eax, %k1 4069; X86-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4070; X86-NEXT: retl 4071; 4072; X64-LABEL: test_mm512_mask_fmsubadd_round_pd: 4073; X64: # %bb.0: # %entry 4074; X64-NEXT: kmovw %edi, %k1 4075; X64-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4076; X64-NEXT: retq 4077entry: 4078 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4079 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4080 %1 = bitcast i8 %__U to <8 x i1> 4081 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4082 ret <8 x double> %2 4083} 4084 4085define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4086; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd: 4087; X86: # %bb.0: # %entry 4088; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4089; X86-NEXT: kmovw %eax, %k1 4090; X86-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4091; X86-NEXT: retl 4092; 4093; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd: 4094; X64: # %bb.0: # %entry 4095; X64-NEXT: kmovw %edi, %k1 4096; X64-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4097; X64-NEXT: retq 4098entry: 4099 %sub = fsub <8 x double> <double -0.000000e+00, double 
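; NOTE: fmsubadd is expressed as fmaddsub with a negated addend: this
; (fsub -0.0, %__C) is the canonical IR negation idiom, and here it is
; expected to fold into the masked vfmsubadd213pd rather than a separate
; vpxorq as in the unmasked case above.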
-0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4100 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4101 %1 = bitcast i8 %__U to <8 x i1> 4102 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 4103 ret <8 x double> %2 4104} 4105 4106define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4107; CHECK-LABEL: test_mm512_fmaddsub_pd: 4108; CHECK: # %bb.0: # %entry 4109; CHECK-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4110; CHECK-NEXT: ret{{[l|q]}} 4111entry: 4112 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4113 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4114 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4115 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4116 ret <8 x double> %3 4117} 4118 4119define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4120; X86-LABEL: test_mm512_mask_fmaddsub_pd: 4121; X86: # %bb.0: # %entry 4122; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4123; X86-NEXT: kmovw %eax, %k1 4124; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2 4125; X86-NEXT: retl 4126; 4127; X64-LABEL: test_mm512_mask_fmaddsub_pd: 4128; X64: # %bb.0: # %entry 4129; X64-NEXT: kmovw %edi, %k1 4130; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2 4131; X64-NEXT: retq 4132entry: 4133 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4134 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4135 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4136 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4137 %4 = bitcast i8 %__U to <8 x i1> 4138 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A 4139 ret <8 x double> %5 4140} 4141 4142define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4143; X86-LABEL: test_mm512_mask3_fmaddsub_pd: 4144; X86: # %bb.0: # %entry 4145; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4146; X86-NEXT: kmovw %eax, %k1 4147; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2 4148; X86-NEXT: vmovapd %zmm2, %zmm0 4149; X86-NEXT: retl 4150; 4151; X64-LABEL: test_mm512_mask3_fmaddsub_pd: 4152; X64: # %bb.0: # %entry 4153; X64-NEXT: kmovw %edi, %k1 4154; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2 4155; X64-NEXT: vmovapd %zmm2, %zmm0 4156; X64-NEXT: retq 4157entry: 4158 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4159 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
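; NOTE: Without rounding control the addsub pattern is built from generic IR:
; two @llvm.fma.v8f64 calls (one with %__C negated, as in this fsub) blended
; by a shufflevector, which the backend recognizes as a single vfmaddsub.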
-0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4160 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4161 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4162 %4 = bitcast i8 %__U to <8 x i1> 4163 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C 4164 ret <8 x double> %5 4165} 4166 4167define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4168; X86-LABEL: test_mm512_maskz_fmaddsub_pd: 4169; X86: # %bb.0: # %entry 4170; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4171; X86-NEXT: kmovw %eax, %k1 4172; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2 4173; X86-NEXT: retl 4174; 4175; X64-LABEL: test_mm512_maskz_fmaddsub_pd: 4176; X64: # %bb.0: # %entry 4177; X64-NEXT: kmovw %edi, %k1 4178; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2 4179; X64-NEXT: retq 4180entry: 4181 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4182 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4183 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4184 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4185 %4 = bitcast i8 %__U to <8 x i1> 4186 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer 4187 ret <8 x double> %5 4188} 4189 4190define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4191; CHECK-LABEL: test_mm512_fmsubadd_pd: 4192; CHECK: # %bb.0: # %entry 4193; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4194; CHECK-NEXT: ret{{[l|q]}} 4195entry: 4196 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4197 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4198 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4199 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4200 ret <8 x double> %2 4201} 4202 4203define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4204; X86-LABEL: test_mm512_mask_fmsubadd_pd: 4205; X86: # %bb.0: # %entry 4206; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4207; X86-NEXT: kmovw %eax, %k1 4208; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2 4209; X86-NEXT: retl 4210; 4211; X64-LABEL: test_mm512_mask_fmsubadd_pd: 4212; X64: # %bb.0: # %entry 4213; X64-NEXT: kmovw %edi, %k1 4214; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2 4215; X64-NEXT: retq 4216entry: 4217 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4218 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x 
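; NOTE: In the _mask_ form the final select merges into %__A, the destination
; operand, which appears to be why the 132 multiplier form is chosen: it
; keeps %__A in place in zmm0.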
double> %sub.i) #10 4219 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4220 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4221 %3 = bitcast i8 %__U to <8 x i1> 4222 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A 4223 ret <8 x double> %4 4224} 4225 4226define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4227; X86-LABEL: test_mm512_maskz_fmsubadd_pd: 4228; X86: # %bb.0: # %entry 4229; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4230; X86-NEXT: kmovw %eax, %k1 4231; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2 4232; X86-NEXT: retl 4233; 4234; X64-LABEL: test_mm512_maskz_fmsubadd_pd: 4235; X64: # %bb.0: # %entry 4236; X64-NEXT: kmovw %edi, %k1 4237; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2 4238; X64-NEXT: retq 4239entry: 4240 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4241 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4242 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4243 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4244 %3 = bitcast i8 %__U to <8 x i1> 4245 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 4246 ret <8 x double> %4 4247} 4248 4249define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4250; CHECK-LABEL: test_mm512_fmaddsub_round_ps: 4251; CHECK: # %bb.0: # %entry 4252; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 4253; CHECK-NEXT: ret{{[l|q]}} 4254entry: 4255 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4256 ret <16 x float> %0 4257} 4258 4259declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1 4260 4261define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4262; X86-LABEL: test_mm512_mask_fmaddsub_round_ps: 4263; X86: # %bb.0: # %entry 4264; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4265; X86-NEXT: kmovw %eax, %k1 4266; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4267; X86-NEXT: retl 4268; 4269; X64-LABEL: test_mm512_mask_fmaddsub_round_ps: 4270; X64: # %bb.0: # %entry 4271; X64-NEXT: kmovw %edi, %k1 4272; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4273; X64-NEXT: retq 4274entry: 4275 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4276 %1 = bitcast i16 %__U to <16 x i1> 4277 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4278 ret <16 x float> %2 4279} 4280 4281define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4282; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps: 4283; X86: # %bb.0: # %entry 4284; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4285; X86-NEXT: kmovw %eax, %k1 4286; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4287; X86-NEXT: 
vmovaps %zmm2, %zmm0 4288; X86-NEXT: retl 4289; 4290; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps: 4291; X64: # %bb.0: # %entry 4292; X64-NEXT: kmovw %edi, %k1 4293; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4294; X64-NEXT: vmovaps %zmm2, %zmm0 4295; X64-NEXT: retq 4296entry: 4297 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4298 %1 = bitcast i16 %__U to <16 x i1> 4299 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4300 ret <16 x float> %2 4301} 4302 4303define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4304; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps: 4305; X86: # %bb.0: # %entry 4306; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4307; X86-NEXT: kmovw %eax, %k1 4308; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4309; X86-NEXT: retl 4310; 4311; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps: 4312; X64: # %bb.0: # %entry 4313; X64-NEXT: kmovw %edi, %k1 4314; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4315; X64-NEXT: retq 4316entry: 4317 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4318 %1 = bitcast i16 %__U to <16 x i1> 4319 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 4320 ret <16 x float> %2 4321} 4322 4323define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4324; X86-LABEL: test_mm512_fmsubadd_round_ps: 4325; X86: # %bb.0: # %entry 4326; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2 4327; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 4328; X86-NEXT: retl 4329; 4330; X64-LABEL: test_mm512_fmsubadd_round_ps: 4331; X64: # %bb.0: # %entry 4332; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2 4333; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 4334; X64-NEXT: retq 4335entry: 4336 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4337 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4338 ret <16 x float> %0 4339} 4340 4341define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4342; X86-LABEL: test_mm512_mask_fmsubadd_round_ps: 4343; X86: # %bb.0: # %entry 4344; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4345; X86-NEXT: kmovw %eax, %k1 4346; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4347; X86-NEXT: retl 4348; 4349; X64-LABEL: test_mm512_mask_fmsubadd_round_ps: 4350; X64: # %bb.0: # %entry 4351; X64-NEXT: kmovw %edi, %k1 4352; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4353; X64-NEXT: retq 4354entry: 4355 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4356 %0 = 
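; NOTE: The generic masking idiom: bitcast the i16 mask to <16 x i1> and
; select between the intrinsic result and the passthrough; this is matched
; to a {%k1}-predicated instruction rather than a separate blend.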
tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4357 %1 = bitcast i16 %__U to <16 x i1> 4358 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4359 ret <16 x float> %2 4360} 4361 4362define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4363; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps: 4364; X86: # %bb.0: # %entry 4365; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4366; X86-NEXT: kmovw %eax, %k1 4367; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4368; X86-NEXT: retl 4369; 4370; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps: 4371; X64: # %bb.0: # %entry 4372; X64-NEXT: kmovw %edi, %k1 4373; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4374; X64-NEXT: retq 4375entry: 4376 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4377 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4378 %1 = bitcast i16 %__U to <16 x i1> 4379 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 4380 ret <16 x float> %2 4381} 4382 4383define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4384; CHECK-LABEL: test_mm512_fmaddsub_ps: 4385; CHECK: # %bb.0: # %entry 4386; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4387; CHECK-NEXT: ret{{[l|q]}} 4388entry: 4389 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4390 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4391 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4392 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4393 ret <16 x float> %3 4394} 4395 4396define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4397; X86-LABEL: test_mm512_mask_fmaddsub_ps: 4398; X86: # %bb.0: # %entry 4399; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4400; X86-NEXT: kmovw %eax, %k1 4401; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2 4402; X86-NEXT: retl 4403; 4404; X64-LABEL: test_mm512_mask_fmaddsub_ps: 4405; X64: # %bb.0: # %entry 4406; X64-NEXT: kmovw %edi, %k1 4407; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2 4408; X64-NEXT: retq 4409entry: 4410 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4411 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4412 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4413 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4414 %4 = bitcast i16 %__U to <16 x i1> 4415 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A 4416 ret <16 x float> %5 4417} 4418 4419define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4420; X86-LABEL: test_mm512_mask3_fmaddsub_ps: 4421; X86: # %bb.0: # %entry 4422; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4423; X86-NEXT: kmovw %eax, %k1 4424; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2 4425; X86-NEXT: vmovaps %zmm2, %zmm0 4426; X86-NEXT: retl 4427; 4428; X64-LABEL: test_mm512_mask3_fmaddsub_ps: 4429; X64: # %bb.0: # %entry 4430; X64-NEXT: kmovw %edi, %k1 4431; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2 4432; X64-NEXT: vmovaps %zmm2, %zmm0 4433; X64-NEXT: retq 4434entry: 4435 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4436 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4437 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4438 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4439 %4 = bitcast i16 %__U to <16 x i1> 4440 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C 4441 ret <16 x float> %5 4442} 4443 4444define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4445; X86-LABEL: test_mm512_maskz_fmaddsub_ps: 4446; X86: # %bb.0: # %entry 4447; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4448; X86-NEXT: kmovw %eax, %k1 4449; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2 4450; X86-NEXT: retl 4451; 4452; X64-LABEL: test_mm512_maskz_fmaddsub_ps: 4453; X64: # %bb.0: # %entry 4454; X64-NEXT: kmovw %edi, %k1 4455; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2 4456; X64-NEXT: retq 4457entry: 4458 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4459 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4460 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4461 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, 
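; NOTE: In this interleave mask, even elements (0, 2, ...) take lanes from
; the fma with negated %__C (the subtract halves) and odd elements (16+n)
; take lanes from the plain fma (the add halves), giving addsub semantics.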
i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4462 %4 = bitcast i16 %__U to <16 x i1> 4463 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer 4464 ret <16 x float> %5 4465} 4466 4467define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4468; CHECK-LABEL: test_mm512_fmsubadd_ps: 4469; CHECK: # %bb.0: # %entry 4470; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4471; CHECK-NEXT: ret{{[l|q]}} 4472entry: 4473 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4474 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4475 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4476 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4477 ret <16 x float> %2 4478} 4479 4480define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4481; X86-LABEL: test_mm512_mask_fmsubadd_ps: 4482; X86: # %bb.0: # %entry 4483; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4484; X86-NEXT: kmovw %eax, %k1 4485; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2 4486; X86-NEXT: retl 4487; 4488; X64-LABEL: test_mm512_mask_fmsubadd_ps: 4489; X64: # %bb.0: # %entry 4490; X64-NEXT: kmovw %edi, %k1 4491; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2 4492; X64-NEXT: retq 4493entry: 4494 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4495 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4496 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4497 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4498 %3 = bitcast i16 %__U to <16 x i1> 4499 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A 4500 ret <16 x float> %4 4501} 4502 4503define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4504; X86-LABEL: test_mm512_maskz_fmsubadd_ps: 4505; X86: # %bb.0: # %entry 4506; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4507; X86-NEXT: kmovw %eax, %k1 4508; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2 4509; X86-NEXT: retl 4510; 4511; X64-LABEL: test_mm512_maskz_fmsubadd_ps: 4512; X64: # %bb.0: # %entry 4513; X64-NEXT: kmovw %edi, %k1 4514; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2 4515; X64-NEXT: retq 4516entry: 4517 %sub.i = fsub <16 x float> <float -0.000000e+00, 
float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4518 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4519 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4520 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4521 %3 = bitcast i16 %__U to <16 x i1> 4522 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 4523 ret <16 x float> %4 4524} 4525 4526define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4527; X86-LABEL: test_mm512_mask3_fmsub_round_pd: 4528; X86: # %bb.0: # %entry 4529; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4530; X86-NEXT: kmovw %eax, %k1 4531; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4532; X86-NEXT: vmovapd %zmm2, %zmm0 4533; X86-NEXT: retl 4534; 4535; X64-LABEL: test_mm512_mask3_fmsub_round_pd: 4536; X64: # %bb.0: # %entry 4537; X64-NEXT: kmovw %edi, %k1 4538; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4539; X64-NEXT: vmovapd %zmm2, %zmm0 4540; X64-NEXT: retq 4541entry: 4542 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4543 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4544 %1 = bitcast i8 %__U to <8 x i1> 4545 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4546 ret <8 x double> %2 4547} 4548 4549define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4550; X86-LABEL: test_mm512_mask3_fmsub_pd: 4551; X86: # %bb.0: # %entry 4552; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4553; X86-NEXT: kmovw %eax, %k1 4554; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2 4555; X86-NEXT: vmovapd %zmm2, %zmm0 4556; X86-NEXT: retl 4557; 4558; X64-LABEL: test_mm512_mask3_fmsub_pd: 4559; X64: # %bb.0: # %entry 4560; X64-NEXT: kmovw %edi, %k1 4561; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2 4562; X64-NEXT: vmovapd %zmm2, %zmm0 4563; X64-NEXT: retq 4564entry: 4565 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4566 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4567 %1 = bitcast i8 %__U to <8 x i1> 4568 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4569 ret <8 x double> %2 4570} 4571 4572define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4573; X86-LABEL: test_mm512_mask3_fmsub_round_ps: 4574; X86: # %bb.0: # %entry 4575; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4576; X86-NEXT: kmovw %eax, %k1 4577; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4578; X86-NEXT: vmovaps %zmm2, %zmm0 4579; 
X86-NEXT: retl 4580; 4581; X64-LABEL: test_mm512_mask3_fmsub_round_ps: 4582; X64: # %bb.0: # %entry 4583; X64-NEXT: kmovw %edi, %k1 4584; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4585; X64-NEXT: vmovaps %zmm2, %zmm0 4586; X64-NEXT: retq 4587entry: 4588 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4589 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4590 %1 = bitcast i16 %__U to <16 x i1> 4591 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4592 ret <16 x float> %2 4593} 4594 4595define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4596; X86-LABEL: test_mm512_mask3_fmsub_ps: 4597; X86: # %bb.0: # %entry 4598; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4599; X86-NEXT: kmovw %eax, %k1 4600; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2 4601; X86-NEXT: vmovaps %zmm2, %zmm0 4602; X86-NEXT: retl 4603; 4604; X64-LABEL: test_mm512_mask3_fmsub_ps: 4605; X64: # %bb.0: # %entry 4606; X64-NEXT: kmovw %edi, %k1 4607; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2 4608; X64-NEXT: vmovaps %zmm2, %zmm0 4609; X64-NEXT: retq 4610entry: 4611 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4612 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4613 %1 = bitcast i16 %__U to <16 x i1> 4614 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4615 ret <16 x float> %2 4616} 4617 4618define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4619; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd: 4620; X86: # %bb.0: # %entry 4621; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4622; X86-NEXT: kmovw %eax, %k1 4623; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4624; X86-NEXT: vmovapd %zmm2, %zmm0 4625; X86-NEXT: retl 4626; 4627; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd: 4628; X64: # %bb.0: # %entry 4629; X64-NEXT: kmovw %edi, %k1 4630; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4631; X64-NEXT: vmovapd %zmm2, %zmm0 4632; X64-NEXT: retq 4633entry: 4634 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4635 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4636 %1 = bitcast i8 %__U to <8 x i1> 4637 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4638 ret <8 x double> %2 4639} 4640 4641define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4642; X86-LABEL: test_mm512_mask3_fmsubadd_pd: 4643; X86: # %bb.0: # 
%entry 4644; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4645; X86-NEXT: kmovw %eax, %k1 4646; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2 4647; X86-NEXT: vmovapd %zmm2, %zmm0 4648; X86-NEXT: retl 4649; 4650; X64-LABEL: test_mm512_mask3_fmsubadd_pd: 4651; X64: # %bb.0: # %entry 4652; X64-NEXT: kmovw %edi, %k1 4653; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2 4654; X64-NEXT: vmovapd %zmm2, %zmm0 4655; X64-NEXT: retq 4656entry: 4657 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4658 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4659 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4660 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4661 %3 = bitcast i8 %__U to <8 x i1> 4662 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C 4663 ret <8 x double> %4 4664} 4665 4666define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4667; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps: 4668; X86: # %bb.0: # %entry 4669; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4670; X86-NEXT: kmovw %eax, %k1 4671; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4672; X86-NEXT: vmovaps %zmm2, %zmm0 4673; X86-NEXT: retl 4674; 4675; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps: 4676; X64: # %bb.0: # %entry 4677; X64-NEXT: kmovw %edi, %k1 4678; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4679; X64-NEXT: vmovaps %zmm2, %zmm0 4680; X64-NEXT: retq 4681entry: 4682 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4683 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4684 %1 = bitcast i16 %__U to <16 x i1> 4685 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4686 ret <16 x float> %2 4687} 4688 4689define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4690; X86-LABEL: test_mm512_mask3_fmsubadd_ps: 4691; X86: # %bb.0: # %entry 4692; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4693; X86-NEXT: kmovw %eax, %k1 4694; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2 4695; X86-NEXT: vmovaps %zmm2, %zmm0 4696; X86-NEXT: retl 4697; 4698; X64-LABEL: test_mm512_mask3_fmsubadd_ps: 4699; X64: # %bb.0: # %entry 4700; X64-NEXT: kmovw %edi, %k1 4701; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2 4702; X64-NEXT: vmovaps %zmm2, %zmm0 4703; X64-NEXT: retq 4704entry: 4705 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, 
%__C 4706 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4707 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4708 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4709 %3 = bitcast i16 %__U to <16 x i1> 4710 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C 4711 ret <16 x float> %4 4712} 4713 4714define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4715; X86-LABEL: test_mm512_mask_fnmadd_round_pd: 4716; X86: # %bb.0: # %entry 4717; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4718; X86-NEXT: kmovw %eax, %k1 4719; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4720; X86-NEXT: retl 4721; 4722; X64-LABEL: test_mm512_mask_fnmadd_round_pd: 4723; X64: # %bb.0: # %entry 4724; X64-NEXT: kmovw %edi, %k1 4725; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4726; X64-NEXT: retq 4727entry: 4728 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4729 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 4730 %1 = bitcast i8 %__U to <8 x i1> 4731 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4732 ret <8 x double> %2 4733} 4734 4735define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4736; X86-LABEL: test_mm512_mask_fnmadd_pd: 4737; X86: # %bb.0: # %entry 4738; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4739; X86-NEXT: kmovw %eax, %k1 4740; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2 4741; X86-NEXT: retl 4742; 4743; X64-LABEL: test_mm512_mask_fnmadd_pd: 4744; X64: # %bb.0: # %entry 4745; X64-NEXT: kmovw %edi, %k1 4746; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2 4747; X64-NEXT: retq 4748entry: 4749 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4750 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 4751 %1 = bitcast i8 %__U to <8 x i1> 4752 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4753 ret <8 x double> %2 4754} 4755 4756define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4757; X86-LABEL: test_mm512_mask_fnmadd_round_ps: 4758; X86: # %bb.0: # %entry 4759; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4760; X86-NEXT: kmovw %eax, %k1 4761; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4762; X86-NEXT: retl 4763; 4764; X64-LABEL: test_mm512_mask_fnmadd_round_ps: 4765; X64: # %bb.0: # %entry 4766; X64-NEXT: kmovw %edi, %k1 4767; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4768; X64-NEXT: retq 4769entry: 4770 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
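; NOTE: fnmadd negates the multiplicand (%__A here) via the (fsub -0.0, x)
; idiom; the sign is folded into the vfnmadd encoding instead of emitting a
; separate negation instruction.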
-0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4771 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8) 4772 %1 = bitcast i16 %__U to <16 x i1> 4773 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4774 ret <16 x float> %2 4775} 4776 4777define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4778; X86-LABEL: test_mm512_mask_fnmadd_ps: 4779; X86: # %bb.0: # %entry 4780; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4781; X86-NEXT: kmovw %eax, %k1 4782; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2 4783; X86-NEXT: retl 4784; 4785; X64-LABEL: test_mm512_mask_fnmadd_ps: 4786; X64: # %bb.0: # %entry 4787; X64-NEXT: kmovw %edi, %k1 4788; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2 4789; X64-NEXT: retq 4790entry: 4791 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4792 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 4793 %1 = bitcast i16 %__U to <16 x i1> 4794 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4795 ret <16 x float> %2 4796} 4797 4798define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4799; X86-LABEL: test_mm512_mask_fnmsub_round_pd: 4800; X86: # %bb.0: # %entry 4801; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4802; X86-NEXT: kmovw %eax, %k1 4803; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4804; X86-NEXT: retl 4805; 4806; X64-LABEL: test_mm512_mask_fnmsub_round_pd: 4807; X64: # %bb.0: # %entry 4808; X64-NEXT: kmovw %edi, %k1 4809; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4810; X64-NEXT: retq 4811entry: 4812 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4813 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4814 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8) 4815 %1 = bitcast i8 %__U to <8 x i1> 4816 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4817 ret <8 x double> %2 4818} 4819 4820define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4821; X86-LABEL: test_mm512_mask3_fnmsub_round_pd: 4822; X86: # %bb.0: # %entry 4823; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4824; X86-NEXT: kmovw %eax, %k1 4825; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4826; X86-NEXT: vmovapd %zmm2, %zmm0 4827; X86-NEXT: retl 4828; 4829; X64-LABEL: test_mm512_mask3_fnmsub_round_pd: 4830; X64: # %bb.0: # %entry 4831; X64-NEXT: kmovw %edi, %k1 4832; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4833; X64-NEXT: vmovapd %zmm2, %zmm0 4834; X64-NEXT: retq 4835entry: 4836 %sub = 
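; NOTE: fnmsub negates both the multiplicand and the addend; in the mask3
; form the result merges into %__C's register, hence the 231 form followed
; by a vmovapd to return it in zmm0.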
fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4837 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4838 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8) 4839 %1 = bitcast i8 %__U to <8 x i1> 4840 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4841 ret <8 x double> %2 4842} 4843 4844define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4845; X86-LABEL: test_mm512_mask_fnmsub_pd: 4846; X86: # %bb.0: # %entry 4847; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4848; X86-NEXT: kmovw %eax, %k1 4849; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2 4850; X86-NEXT: retl 4851; 4852; X64-LABEL: test_mm512_mask_fnmsub_pd: 4853; X64: # %bb.0: # %entry 4854; X64-NEXT: kmovw %edi, %k1 4855; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2 4856; X64-NEXT: retq 4857entry: 4858 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4859 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4860 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10 4861 %1 = bitcast i8 %__U to <8 x i1> 4862 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4863 ret <8 x double> %2 4864} 4865 4866define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4867; X86-LABEL: test_mm512_mask3_fnmsub_pd: 4868; X86: # %bb.0: # %entry 4869; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4870; X86-NEXT: kmovw %eax, %k1 4871; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2 4872; X86-NEXT: vmovapd %zmm2, %zmm0 4873; X86-NEXT: retl 4874; 4875; X64-LABEL: test_mm512_mask3_fnmsub_pd: 4876; X64: # %bb.0: # %entry 4877; X64-NEXT: kmovw %edi, %k1 4878; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2 4879; X64-NEXT: vmovapd %zmm2, %zmm0 4880; X64-NEXT: retq 4881entry: 4882 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4883 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4884 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10 4885 %1 = bitcast i8 %__U to <8 x i1> 4886 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4887 ret <8 x double> %2 4888} 4889 4890define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4891; X86-LABEL: test_mm512_mask_fnmsub_round_ps: 4892; X86: # %bb.0: # %entry 4893; X86-NEXT: movzwl {{[0-9]+}}(%esp), 
%eax 4894; X86-NEXT: kmovw %eax, %k1 4895; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4896; X86-NEXT: retl 4897; 4898; X64-LABEL: test_mm512_mask_fnmsub_round_ps: 4899; X64: # %bb.0: # %entry 4900; X64-NEXT: kmovw %edi, %k1 4901; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4902; X64-NEXT: retq 4903entry: 4904 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4905 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4906 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8) 4907 %1 = bitcast i16 %__U to <16 x i1> 4908 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4909 ret <16 x float> %2 4910} 4911 4912define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4913; X86-LABEL: test_mm512_mask3_fnmsub_round_ps: 4914; X86: # %bb.0: # %entry 4915; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4916; X86-NEXT: kmovw %eax, %k1 4917; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4918; X86-NEXT: vmovaps %zmm2, %zmm0 4919; X86-NEXT: retl 4920; 4921; X64-LABEL: test_mm512_mask3_fnmsub_round_ps: 4922; X64: # %bb.0: # %entry 4923; X64-NEXT: kmovw %edi, %k1 4924; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4925; X64-NEXT: vmovaps %zmm2, %zmm0 4926; X64-NEXT: retq 4927entry: 4928 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4929 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4930 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8) 4931 %1 = bitcast i16 %__U to <16 x i1> 4932 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4933 ret <16 x float> %2 4934} 4935 4936define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4937; X86-LABEL: test_mm512_mask_fnmsub_ps: 4938; X86: # %bb.0: # %entry 4939; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4940; X86-NEXT: kmovw %eax, %k1 4941; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2 4942; X86-NEXT: retl 4943; 4944; X64-LABEL: test_mm512_mask_fnmsub_ps: 4945; X64: # %bb.0: # %entry 4946; X64-NEXT: kmovw %edi, %k1 4947; X64-NEXT: vfnmsub132ps 
{{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2 4948; X64-NEXT: retq 4949entry: 4950 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4951 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4952 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10 4953 %1 = bitcast i16 %__U to <16 x i1> 4954 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4955 ret <16 x float> %2 4956} 4957 4958define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4959; X86-LABEL: test_mm512_mask3_fnmsub_ps: 4960; X86: # %bb.0: # %entry 4961; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 4962; X86-NEXT: kmovw %eax, %k1 4963; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2 4964; X86-NEXT: vmovaps %zmm2, %zmm0 4965; X86-NEXT: retl 4966; 4967; X64-LABEL: test_mm512_mask3_fnmsub_ps: 4968; X64: # %bb.0: # %entry 4969; X64-NEXT: kmovw %edi, %k1 4970; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2 4971; X64-NEXT: vmovaps %zmm2, %zmm0 4972; X64-NEXT: retq 4973entry: 4974 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4975 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4976 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10 4977 %1 = bitcast i16 %__U to <16 x i1> 4978 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4979 ret <16 x float> %2 4980} 4981 4982define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 4983; X86-LABEL: test_mm_mask_fmadd_ss: 4984; X86: # %bb.0: # %entry 4985; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4986; X86-NEXT: kmovw %eax, %k1 4987; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2 4988; X86-NEXT: retl 4989; 4990; X64-LABEL: test_mm_mask_fmadd_ss: 4991; X64: # %bb.0: # %entry 4992; X64-NEXT: kmovw %edi, %k1 4993; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2 4994; X64-NEXT: retq 4995entry: 4996 %0 = extractelement <4 x float> %__W, i64 0 4997 %1 = extractelement <4 x float> %__A, i64 0 4998 %2 = extractelement <4 x float> %__B, i64 0 4999 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5000 %4 
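; NOTE: For the scalar ss masking pattern only bit 0 of the i8 mask is
; meaningful: the mask is reduced with 'and i8 %__U, 1', the scalar result
; or the passthrough element is selected, and lane 0 is reinserted.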
= and i8 %__U, 1 5001 %tobool.i = icmp eq i8 %4, 0 5002 %vecext1.i = extractelement <4 x float> %__W, i32 0 5003 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5004 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5005 ret <4 x float> %vecins.i 5006} 5007 5008define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5009; X86-LABEL: test_mm_mask_fmadd_round_ss: 5010; X86: # %bb.0: # %entry 5011; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5012; X86-NEXT: kmovw %eax, %k1 5013; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5014; X86-NEXT: retl 5015; 5016; X64-LABEL: test_mm_mask_fmadd_round_ss: 5017; X64: # %bb.0: # %entry 5018; X64-NEXT: kmovw %edi, %k1 5019; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5020; X64-NEXT: retq 5021entry: 5022 %0 = extractelement <4 x float> %__W, i64 0 5023 %1 = extractelement <4 x float> %__A, i64 0 5024 %2 = extractelement <4 x float> %__B, i64 0 5025 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5026 %4 = bitcast i8 %__U to <8 x i1> 5027 %5 = extractelement <8 x i1> %4, i64 0 5028 %6 = select i1 %5, float %3, float %0 5029 %7 = insertelement <4 x float> %__W, float %6, i64 0 5030 ret <4 x float> %7 5031} 5032 5033declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1 5034 5035define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5036; X86-LABEL: test_mm_maskz_fmadd_ss: 5037; X86: # %bb.0: # %entry 5038; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5039; X86-NEXT: kmovw %eax, %k1 5040; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 5041; X86-NEXT: retl 5042; 5043; X64-LABEL: test_mm_maskz_fmadd_ss: 5044; X64: # %bb.0: # %entry 5045; X64-NEXT: kmovw %edi, %k1 5046; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 5047; X64-NEXT: retq 5048entry: 5049 %0 = extractelement <4 x float> %__A, i64 0 5050 %1 = extractelement <4 x float> %__B, i64 0 5051 %2 = extractelement <4 x float> %__C, i64 0 5052 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5053 %4 = and i8 %__U, 1 5054 %tobool.i = icmp eq i8 %4, 0 5055 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5056 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5057 ret <4 x float> %vecins.i 5058} 5059 5060define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5061; X86-LABEL: test_mm_maskz_fmadd_round_ss: 5062; X86: # %bb.0: # %entry 5063; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5064; X86-NEXT: kmovw %eax, %k1 5065; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5066; X86-NEXT: retl 5067; 5068; X64-LABEL: test_mm_maskz_fmadd_round_ss: 5069; X64: # %bb.0: # %entry 5070; X64-NEXT: kmovw %edi, %k1 5071; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5072; X64-NEXT: retq 5073entry: 5074 %0 = extractelement <4 x float> %__A, i64 0 5075 %1 = extractelement <4 x float> %__B, i64 0 5076 %2 = extractelement <4 x float> %__C, i64 0 5077 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5078 %4 = bitcast i8 %__U to <8 x i1> 5079 %5 = extractelement <8 x i1> %4, i64 0 5080 %6 = select i1 %5, float %3, float 0.000000e+00 5081 %7 = insertelement <4 x float> %__A, float %6, i64 0 5082 ret <4 x float> %7 5083} 5084 5085define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x 
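; NOTE: The scalar mask3 variant merges into the addend vector (%__Y),
; matching vfmadd231ss into xmm2 and then a vmovaps to move xmm2 into the
; return register.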
float> %__Y, i8 zeroext %__U) { 5086; X86-LABEL: test_mm_mask3_fmadd_ss: 5087; X86: # %bb.0: # %entry 5088; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5089; X86-NEXT: kmovw %eax, %k1 5090; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 5091; X86-NEXT: vmovaps %xmm2, %xmm0 5092; X86-NEXT: retl 5093; 5094; X64-LABEL: test_mm_mask3_fmadd_ss: 5095; X64: # %bb.0: # %entry 5096; X64-NEXT: kmovw %edi, %k1 5097; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 5098; X64-NEXT: vmovaps %xmm2, %xmm0 5099; X64-NEXT: retq 5100entry: 5101 %0 = extractelement <4 x float> %__W, i64 0 5102 %1 = extractelement <4 x float> %__X, i64 0 5103 %2 = extractelement <4 x float> %__Y, i64 0 5104 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5105 %4 = and i8 %__U, 1 5106 %tobool.i = icmp eq i8 %4, 0 5107 %vecext1.i = extractelement <4 x float> %__Y, i32 0 5108 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5109 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5110 ret <4 x float> %vecins.i 5111} 5112 5113define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5114; X86-LABEL: test_mm_mask3_fmadd_round_ss: 5115; X86: # %bb.0: # %entry 5116; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5117; X86-NEXT: kmovw %eax, %k1 5118; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5119; X86-NEXT: vmovaps %xmm2, %xmm0 5120; X86-NEXT: retl 5121; 5122; X64-LABEL: test_mm_mask3_fmadd_round_ss: 5123; X64: # %bb.0: # %entry 5124; X64-NEXT: kmovw %edi, %k1 5125; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5126; X64-NEXT: vmovaps %xmm2, %xmm0 5127; X64-NEXT: retq 5128entry: 5129 %0 = extractelement <4 x float> %__W, i64 0 5130 %1 = extractelement <4 x float> %__X, i64 0 5131 %2 = extractelement <4 x float> %__Y, i64 0 5132 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5133 %4 = bitcast i8 %__U to <8 x i1> 5134 %5 = extractelement <8 x i1> %4, i64 0 5135 %6 = select i1 %5, float %3, float %2 5136 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5137 ret <4 x float> %7 5138} 5139 5140define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5141; X86-LABEL: test_mm_mask_fmsub_ss: 5142; X86: # %bb.0: # %entry 5143; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5144; X86-NEXT: kmovw %eax, %k1 5145; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2 5146; X86-NEXT: retl 5147; 5148; X64-LABEL: test_mm_mask_fmsub_ss: 5149; X64: # %bb.0: # %entry 5150; X64-NEXT: kmovw %edi, %k1 5151; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2 5152; X64-NEXT: retq 5153entry: 5154 %0 = extractelement <4 x float> %__W, i64 0 5155 %1 = extractelement <4 x float> %__A, i64 0 5156 %.rhs.i = extractelement <4 x float> %__B, i64 0 5157 %2 = fsub float -0.000000e+00, %.rhs.i 5158 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5159 %4 = and i8 %__U, 1 5160 %tobool.i = icmp eq i8 %4, 0 5161 %vecext1.i = extractelement <4 x float> %__W, i32 0 5162 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5163 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5164 ret <4 x float> %vecins.i 5165} 5166 5167define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5168; X86-LABEL: test_mm_mask_fmsub_round_ss: 5169; X86: # %bb.0: # %entry 5170; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5171; X86-NEXT: 
kmovw %eax, %k1 5172; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5173; X86-NEXT: retl 5174; 5175; X64-LABEL: test_mm_mask_fmsub_round_ss: 5176; X64: # %bb.0: # %entry 5177; X64-NEXT: kmovw %edi, %k1 5178; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5179; X64-NEXT: retq 5180entry: 5181 %0 = extractelement <4 x float> %__W, i64 0 5182 %1 = extractelement <4 x float> %__A, i64 0 5183 %.rhs = extractelement <4 x float> %__B, i64 0 5184 %2 = fsub float -0.000000e+00, %.rhs 5185 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5186 %4 = bitcast i8 %__U to <8 x i1> 5187 %5 = extractelement <8 x i1> %4, i64 0 5188 %6 = select i1 %5, float %3, float %0 5189 %7 = insertelement <4 x float> %__W, float %6, i64 0 5190 ret <4 x float> %7 5191} 5192 5193define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5194; X86-LABEL: test_mm_maskz_fmsub_ss: 5195; X86: # %bb.0: # %entry 5196; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5197; X86-NEXT: kmovw %eax, %k1 5198; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 5199; X86-NEXT: retl 5200; 5201; X64-LABEL: test_mm_maskz_fmsub_ss: 5202; X64: # %bb.0: # %entry 5203; X64-NEXT: kmovw %edi, %k1 5204; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 5205; X64-NEXT: retq 5206entry: 5207 %0 = extractelement <4 x float> %__A, i64 0 5208 %1 = extractelement <4 x float> %__B, i64 0 5209 %.rhs.i = extractelement <4 x float> %__C, i64 0 5210 %2 = fsub float -0.000000e+00, %.rhs.i 5211 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5212 %4 = and i8 %__U, 1 5213 %tobool.i = icmp eq i8 %4, 0 5214 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5215 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5216 ret <4 x float> %vecins.i 5217} 5218 5219define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5220; X86-LABEL: test_mm_maskz_fmsub_round_ss: 5221; X86: # %bb.0: # %entry 5222; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5223; X86-NEXT: kmovw %eax, %k1 5224; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5225; X86-NEXT: retl 5226; 5227; X64-LABEL: test_mm_maskz_fmsub_round_ss: 5228; X64: # %bb.0: # %entry 5229; X64-NEXT: kmovw %edi, %k1 5230; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5231; X64-NEXT: retq 5232entry: 5233 %0 = extractelement <4 x float> %__A, i64 0 5234 %1 = extractelement <4 x float> %__B, i64 0 5235 %.rhs = extractelement <4 x float> %__C, i64 0 5236 %2 = fsub float -0.000000e+00, %.rhs 5237 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5238 %4 = bitcast i8 %__U to <8 x i1> 5239 %5 = extractelement <8 x i1> %4, i64 0 5240 %6 = select i1 %5, float %3, float 0.000000e+00 5241 %7 = insertelement <4 x float> %__A, float %6, i64 0 5242 ret <4 x float> %7 5243} 5244 5245define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5246; X86-LABEL: test_mm_mask3_fmsub_ss: 5247; X86: # %bb.0: # %entry 5248; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5249; X86-NEXT: kmovw %eax, %k1 5250; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2 5251; X86-NEXT: vmovaps %xmm2, %xmm0 5252; X86-NEXT: retl 5253; 5254; X64-LABEL: test_mm_mask3_fmsub_ss: 5255; X64: # %bb.0: # %entry 5256; X64-NEXT: kmovw %edi, %k1 5257; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - 
xmm2 5258; X64-NEXT: vmovaps %xmm2, %xmm0 5259; X64-NEXT: retq 5260entry: 5261 %0 = extractelement <4 x float> %__W, i64 0 5262 %1 = extractelement <4 x float> %__X, i64 0 5263 %.rhs.i = extractelement <4 x float> %__Y, i64 0 5264 %2 = fsub float -0.000000e+00, %.rhs.i 5265 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5266 %4 = and i8 %__U, 1 5267 %tobool.i = icmp eq i8 %4, 0 5268 %vecext1.i = extractelement <4 x float> %__Y, i32 0 5269 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5270 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5271 ret <4 x float> %vecins.i 5272} 5273 5274define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5275; X86-LABEL: test_mm_mask3_fmsub_round_ss: 5276; X86: # %bb.0: # %entry 5277; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5278; X86-NEXT: kmovw %eax, %k1 5279; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5280; X86-NEXT: vmovaps %xmm2, %xmm0 5281; X86-NEXT: retl 5282; 5283; X64-LABEL: test_mm_mask3_fmsub_round_ss: 5284; X64: # %bb.0: # %entry 5285; X64-NEXT: kmovw %edi, %k1 5286; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5287; X64-NEXT: vmovaps %xmm2, %xmm0 5288; X64-NEXT: retq 5289entry: 5290 %0 = extractelement <4 x float> %__W, i64 0 5291 %1 = extractelement <4 x float> %__X, i64 0 5292 %.rhs = extractelement <4 x float> %__Y, i64 0 5293 %2 = fsub float -0.000000e+00, %.rhs 5294 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5295 %4 = bitcast i8 %__U to <8 x i1> 5296 %5 = extractelement <8 x i1> %4, i64 0 5297 %6 = select i1 %5, float %3, float %.rhs 5298 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5299 ret <4 x float> %7 5300} 5301 5302define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5303; X86-LABEL: test_mm_mask_fnmadd_ss: 5304; X86: # %bb.0: # %entry 5305; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5306; X86-NEXT: kmovw %eax, %k1 5307; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2 5308; X86-NEXT: retl 5309; 5310; X64-LABEL: test_mm_mask_fnmadd_ss: 5311; X64: # %bb.0: # %entry 5312; X64-NEXT: kmovw %edi, %k1 5313; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2 5314; X64-NEXT: retq 5315entry: 5316 %0 = extractelement <4 x float> %__W, i64 0 5317 %.rhs.i = extractelement <4 x float> %__A, i64 0 5318 %1 = fsub float -0.000000e+00, %.rhs.i 5319 %2 = extractelement <4 x float> %__B, i64 0 5320 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5321 %4 = and i8 %__U, 1 5322 %tobool.i = icmp eq i8 %4, 0 5323 %vecext1.i = extractelement <4 x float> %__W, i32 0 5324 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5325 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5326 ret <4 x float> %vecins.i 5327} 5328 5329define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5330; X86-LABEL: test_mm_mask_fnmadd_round_ss: 5331; X86: # %bb.0: # %entry 5332; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5333; X86-NEXT: kmovw %eax, %k1 5334; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5335; X86-NEXT: retl 5336; 5337; X64-LABEL: test_mm_mask_fnmadd_round_ss: 5338; X64: # %bb.0: # %entry 5339; X64-NEXT: kmovw %edi, %k1 5340; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5341; X64-NEXT: retq 5342entry: 5343 %0 = extractelement <4 x float> %__W, i64 0 5344 %.rhs = 
extractelement <4 x float> %__A, i64 0 5345 %1 = fsub float -0.000000e+00, %.rhs 5346 %2 = extractelement <4 x float> %__B, i64 0 5347 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5348 %4 = bitcast i8 %__U to <8 x i1> 5349 %5 = extractelement <8 x i1> %4, i64 0 5350 %6 = select i1 %5, float %3, float %0 5351 %7 = insertelement <4 x float> %__W, float %6, i64 0 5352 ret <4 x float> %7 5353} 5354 5355define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5356; X86-LABEL: test_mm_maskz_fnmadd_ss: 5357; X86: # %bb.0: # %entry 5358; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5359; X86-NEXT: kmovw %eax, %k1 5360; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 5361; X86-NEXT: retl 5362; 5363; X64-LABEL: test_mm_maskz_fnmadd_ss: 5364; X64: # %bb.0: # %entry 5365; X64-NEXT: kmovw %edi, %k1 5366; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 5367; X64-NEXT: retq 5368entry: 5369 %0 = extractelement <4 x float> %__A, i64 0 5370 %.rhs.i = extractelement <4 x float> %__B, i64 0 5371 %1 = fsub float -0.000000e+00, %.rhs.i 5372 %2 = extractelement <4 x float> %__C, i64 0 5373 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5374 %4 = and i8 %__U, 1 5375 %tobool.i = icmp eq i8 %4, 0 5376 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5377 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5378 ret <4 x float> %vecins.i 5379} 5380 5381define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5382; X86-LABEL: test_mm_maskz_fnmadd_round_ss: 5383; X86: # %bb.0: # %entry 5384; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5385; X86-NEXT: kmovw %eax, %k1 5386; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5387; X86-NEXT: retl 5388; 5389; X64-LABEL: test_mm_maskz_fnmadd_round_ss: 5390; X64: # %bb.0: # %entry 5391; X64-NEXT: kmovw %edi, %k1 5392; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5393; X64-NEXT: retq 5394entry: 5395 %0 = extractelement <4 x float> %__A, i64 0 5396 %.rhs = extractelement <4 x float> %__B, i64 0 5397 %1 = fsub float -0.000000e+00, %.rhs 5398 %2 = extractelement <4 x float> %__C, i64 0 5399 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5400 %4 = bitcast i8 %__U to <8 x i1> 5401 %5 = extractelement <8 x i1> %4, i64 0 5402 %6 = select i1 %5, float %3, float 0.000000e+00 5403 %7 = insertelement <4 x float> %__A, float %6, i64 0 5404 ret <4 x float> %7 5405} 5406 5407define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5408; X86-LABEL: test_mm_mask3_fnmadd_ss: 5409; X86: # %bb.0: # %entry 5410; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5411; X86-NEXT: kmovw %eax, %k1 5412; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 5413; X86-NEXT: vmovaps %xmm2, %xmm0 5414; X86-NEXT: retl 5415; 5416; X64-LABEL: test_mm_mask3_fnmadd_ss: 5417; X64: # %bb.0: # %entry 5418; X64-NEXT: kmovw %edi, %k1 5419; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 5420; X64-NEXT: vmovaps %xmm2, %xmm0 5421; X64-NEXT: retq 5422entry: 5423 %0 = extractelement <4 x float> %__W, i64 0 5424 %.rhs.i = extractelement <4 x float> %__X, i64 0 5425 %1 = fsub float -0.000000e+00, %.rhs.i 5426 %2 = extractelement <4 x float> %__Y, i64 0 5427 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5428 %4 = 
and i8 %__U, 1 5429 %tobool.i = icmp eq i8 %4, 0 5430 %vecext1.i = extractelement <4 x float> %__Y, i32 0 5431 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5432 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5433 ret <4 x float> %vecins.i 5434} 5435 5436define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5437; X86-LABEL: test_mm_mask3_fnmadd_round_ss: 5438; X86: # %bb.0: # %entry 5439; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5440; X86-NEXT: kmovw %eax, %k1 5441; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5442; X86-NEXT: vmovaps %xmm2, %xmm0 5443; X86-NEXT: retl 5444; 5445; X64-LABEL: test_mm_mask3_fnmadd_round_ss: 5446; X64: # %bb.0: # %entry 5447; X64-NEXT: kmovw %edi, %k1 5448; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5449; X64-NEXT: vmovaps %xmm2, %xmm0 5450; X64-NEXT: retq 5451entry: 5452 %0 = extractelement <4 x float> %__W, i64 0 5453 %.rhs = extractelement <4 x float> %__X, i64 0 5454 %1 = fsub float -0.000000e+00, %.rhs 5455 %2 = extractelement <4 x float> %__Y, i64 0 5456 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5457 %4 = bitcast i8 %__U to <8 x i1> 5458 %5 = extractelement <8 x i1> %4, i64 0 5459 %6 = select i1 %5, float %3, float %2 5460 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5461 ret <4 x float> %7 5462} 5463 5464define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5465; X86-LABEL: test_mm_mask_fnmsub_ss: 5466; X86: # %bb.0: # %entry 5467; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5468; X86-NEXT: kmovw %eax, %k1 5469; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2 5470; X86-NEXT: retl 5471; 5472; X64-LABEL: test_mm_mask_fnmsub_ss: 5473; X64: # %bb.0: # %entry 5474; X64-NEXT: kmovw %edi, %k1 5475; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2 5476; X64-NEXT: retq 5477entry: 5478 %0 = extractelement <4 x float> %__W, i64 0 5479 %.rhs.i = extractelement <4 x float> %__A, i64 0 5480 %1 = fsub float -0.000000e+00, %.rhs.i 5481 %.rhs7.i = extractelement <4 x float> %__B, i64 0 5482 %2 = fsub float -0.000000e+00, %.rhs7.i 5483 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5484 %4 = and i8 %__U, 1 5485 %tobool.i = icmp eq i8 %4, 0 5486 %vecext2.i = extractelement <4 x float> %__W, i32 0 5487 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3 5488 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5489 ret <4 x float> %vecins.i 5490} 5491 5492define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5493; X86-LABEL: test_mm_mask_fnmsub_round_ss: 5494; X86: # %bb.0: # %entry 5495; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5496; X86-NEXT: kmovw %eax, %k1 5497; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5498; X86-NEXT: retl 5499; 5500; X64-LABEL: test_mm_mask_fnmsub_round_ss: 5501; X64: # %bb.0: # %entry 5502; X64-NEXT: kmovw %edi, %k1 5503; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5504; X64-NEXT: retq 5505entry: 5506 %0 = extractelement <4 x float> %__W, i64 0 5507 %.rhs = extractelement <4 x float> %__A, i64 0 5508 %1 = fsub float -0.000000e+00, %.rhs 5509 %.rhs2 = extractelement <4 x float> %__B, i64 0 5510 %2 = fsub float -0.000000e+00, %.rhs2 5511 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5512 %4 = bitcast i8 %__U to <8 x 
i1> 5513 %5 = extractelement <8 x i1> %4, i64 0 5514 %6 = select i1 %5, float %3, float %0 5515 %7 = insertelement <4 x float> %__W, float %6, i64 0 5516 ret <4 x float> %7 5517} 5518 5519define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5520; X86-LABEL: test_mm_maskz_fnmsub_ss: 5521; X86: # %bb.0: # %entry 5522; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5523; X86-NEXT: kmovw %eax, %k1 5524; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 5525; X86-NEXT: retl 5526; 5527; X64-LABEL: test_mm_maskz_fnmsub_ss: 5528; X64: # %bb.0: # %entry 5529; X64-NEXT: kmovw %edi, %k1 5530; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 5531; X64-NEXT: retq 5532entry: 5533 %0 = extractelement <4 x float> %__A, i64 0 5534 %.rhs.i = extractelement <4 x float> %__B, i64 0 5535 %1 = fsub float -0.000000e+00, %.rhs.i 5536 %.rhs5.i = extractelement <4 x float> %__C, i64 0 5537 %2 = fsub float -0.000000e+00, %.rhs5.i 5538 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5539 %4 = and i8 %__U, 1 5540 %tobool.i = icmp eq i8 %4, 0 5541 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5542 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5543 ret <4 x float> %vecins.i 5544} 5545 5546define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5547; X86-LABEL: test_mm_maskz_fnmsub_round_ss: 5548; X86: # %bb.0: # %entry 5549; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5550; X86-NEXT: kmovw %eax, %k1 5551; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5552; X86-NEXT: retl 5553; 5554; X64-LABEL: test_mm_maskz_fnmsub_round_ss: 5555; X64: # %bb.0: # %entry 5556; X64-NEXT: kmovw %edi, %k1 5557; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5558; X64-NEXT: retq 5559entry: 5560 %0 = extractelement <4 x float> %__A, i64 0 5561 %.rhs = extractelement <4 x float> %__B, i64 0 5562 %1 = fsub float -0.000000e+00, %.rhs 5563 %.rhs2 = extractelement <4 x float> %__C, i64 0 5564 %2 = fsub float -0.000000e+00, %.rhs2 5565 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5566 %4 = bitcast i8 %__U to <8 x i1> 5567 %5 = extractelement <8 x i1> %4, i64 0 5568 %6 = select i1 %5, float %3, float 0.000000e+00 5569 %7 = insertelement <4 x float> %__A, float %6, i64 0 5570 ret <4 x float> %7 5571} 5572 5573define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5574; X86-LABEL: test_mm_mask3_fnmsub_ss: 5575; X86: # %bb.0: # %entry 5576; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5577; X86-NEXT: kmovw %eax, %k1 5578; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 5579; X86-NEXT: vmovaps %xmm2, %xmm0 5580; X86-NEXT: retl 5581; 5582; X64-LABEL: test_mm_mask3_fnmsub_ss: 5583; X64: # %bb.0: # %entry 5584; X64-NEXT: kmovw %edi, %k1 5585; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 5586; X64-NEXT: vmovaps %xmm2, %xmm0 5587; X64-NEXT: retq 5588entry: 5589 %0 = extractelement <4 x float> %__W, i64 0 5590 %.rhs.i = extractelement <4 x float> %__X, i64 0 5591 %1 = fsub float -0.000000e+00, %.rhs.i 5592 %.rhs7.i = extractelement <4 x float> %__Y, i64 0 5593 %2 = fsub float -0.000000e+00, %.rhs7.i 5594 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5595 %4 = and i8 %__U, 1 5596 %tobool.i = icmp eq i8 %4, 0 5597 %vecext2.i = extractelement <4 x float> %__Y, i32 0 
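; mask3 merge semantics: when the low bit of %__U is clear, the select below keeps element 0 of the addend %__Y rather than the computed fnmsub result.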
5598 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3 5599 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5600 ret <4 x float> %vecins.i 5601} 5602 5603define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5604; X86-LABEL: test_mm_mask3_fnmsub_round_ss: 5605; X86: # %bb.0: # %entry 5606; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5607; X86-NEXT: kmovw %eax, %k1 5608; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5609; X86-NEXT: vmovaps %xmm2, %xmm0 5610; X86-NEXT: retl 5611; 5612; X64-LABEL: test_mm_mask3_fnmsub_round_ss: 5613; X64: # %bb.0: # %entry 5614; X64-NEXT: kmovw %edi, %k1 5615; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5616; X64-NEXT: vmovaps %xmm2, %xmm0 5617; X64-NEXT: retq 5618entry: 5619 %0 = extractelement <4 x float> %__W, i64 0 5620 %.rhs = extractelement <4 x float> %__X, i64 0 5621 %1 = fsub float -0.000000e+00, %.rhs 5622 %.rhs1 = extractelement <4 x float> %__Y, i64 0 5623 %2 = fsub float -0.000000e+00, %.rhs1 5624 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5625 %4 = bitcast i8 %__U to <8 x i1> 5626 %5 = extractelement <8 x i1> %4, i64 0 5627 %6 = select i1 %5, float %3, float %.rhs1 5628 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5629 ret <4 x float> %7 5630} 5631 5632define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5633; X86-LABEL: test_mm_mask_fmadd_sd: 5634; X86: # %bb.0: # %entry 5635; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5636; X86-NEXT: kmovw %eax, %k1 5637; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2 5638; X86-NEXT: retl 5639; 5640; X64-LABEL: test_mm_mask_fmadd_sd: 5641; X64: # %bb.0: # %entry 5642; X64-NEXT: kmovw %edi, %k1 5643; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2 5644; X64-NEXT: retq 5645entry: 5646 %0 = extractelement <2 x double> %__W, i64 0 5647 %1 = extractelement <2 x double> %__A, i64 0 5648 %2 = extractelement <2 x double> %__B, i64 0 5649 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5650 %4 = and i8 %__U, 1 5651 %tobool.i = icmp eq i8 %4, 0 5652 %vecext1.i = extractelement <2 x double> %__W, i32 0 5653 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5654 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5655 ret <2 x double> %vecins.i 5656} 5657 5658define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5659; X86-LABEL: test_mm_mask_fmadd_round_sd: 5660; X86: # %bb.0: # %entry 5661; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5662; X86-NEXT: kmovw %eax, %k1 5663; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5664; X86-NEXT: retl 5665; 5666; X64-LABEL: test_mm_mask_fmadd_round_sd: 5667; X64: # %bb.0: # %entry 5668; X64-NEXT: kmovw %edi, %k1 5669; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5670; X64-NEXT: retq 5671entry: 5672 %0 = extractelement <2 x double> %__W, i64 0 5673 %1 = extractelement <2 x double> %__A, i64 0 5674 %2 = extractelement <2 x double> %__B, i64 0 5675 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5676 %4 = bitcast i8 %__U to <8 x i1> 5677 %5 = extractelement <8 x i1> %4, i64 0 5678 %6 = select i1 %5, double %3, double %0 5679 %7 = insertelement <2 x double> %__W, double %6, i64 0 5680 ret <2 x double> %7 5681} 5682 5683declare double 
@llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1 5684 5685define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5686; X86-LABEL: test_mm_maskz_fmadd_sd: 5687; X86: # %bb.0: # %entry 5688; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5689; X86-NEXT: kmovw %eax, %k1 5690; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 5691; X86-NEXT: retl 5692; 5693; X64-LABEL: test_mm_maskz_fmadd_sd: 5694; X64: # %bb.0: # %entry 5695; X64-NEXT: kmovw %edi, %k1 5696; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 5697; X64-NEXT: retq 5698entry: 5699 %0 = extractelement <2 x double> %__A, i64 0 5700 %1 = extractelement <2 x double> %__B, i64 0 5701 %2 = extractelement <2 x double> %__C, i64 0 5702 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5703 %4 = and i8 %__U, 1 5704 %tobool.i = icmp eq i8 %4, 0 5705 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5706 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5707 ret <2 x double> %vecins.i 5708} 5709 5710define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5711; X86-LABEL: test_mm_maskz_fmadd_round_sd: 5712; X86: # %bb.0: # %entry 5713; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5714; X86-NEXT: kmovw %eax, %k1 5715; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5716; X86-NEXT: retl 5717; 5718; X64-LABEL: test_mm_maskz_fmadd_round_sd: 5719; X64: # %bb.0: # %entry 5720; X64-NEXT: kmovw %edi, %k1 5721; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5722; X64-NEXT: retq 5723entry: 5724 %0 = extractelement <2 x double> %__A, i64 0 5725 %1 = extractelement <2 x double> %__B, i64 0 5726 %2 = extractelement <2 x double> %__C, i64 0 5727 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5728 %4 = bitcast i8 %__U to <8 x i1> 5729 %5 = extractelement <8 x i1> %4, i64 0 5730 %6 = select i1 %5, double %3, double 0.000000e+00 5731 %7 = insertelement <2 x double> %__A, double %6, i64 0 5732 ret <2 x double> %7 5733} 5734 5735define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5736; X86-LABEL: test_mm_mask3_fmadd_sd: 5737; X86: # %bb.0: # %entry 5738; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5739; X86-NEXT: kmovw %eax, %k1 5740; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 5741; X86-NEXT: vmovapd %xmm2, %xmm0 5742; X86-NEXT: retl 5743; 5744; X64-LABEL: test_mm_mask3_fmadd_sd: 5745; X64: # %bb.0: # %entry 5746; X64-NEXT: kmovw %edi, %k1 5747; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 5748; X64-NEXT: vmovapd %xmm2, %xmm0 5749; X64-NEXT: retq 5750entry: 5751 %0 = extractelement <2 x double> %__W, i64 0 5752 %1 = extractelement <2 x double> %__X, i64 0 5753 %2 = extractelement <2 x double> %__Y, i64 0 5754 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5755 %4 = and i8 %__U, 1 5756 %tobool.i = icmp eq i8 %4, 0 5757 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5758 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5759 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5760 ret <2 x double> %vecins.i 5761} 5762 5763define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5764; X86-LABEL: test_mm_mask3_fmadd_round_sd: 5765; X86: # %bb.0: # %entry 
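; The i8 mask is passed on the stack under the i386 calling convention, so the checks below expect a byte load followed by a kmovw into a mask register.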
5766; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5767; X86-NEXT: kmovw %eax, %k1 5768; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5769; X86-NEXT: vmovapd %xmm2, %xmm0 5770; X86-NEXT: retl 5771; 5772; X64-LABEL: test_mm_mask3_fmadd_round_sd: 5773; X64: # %bb.0: # %entry 5774; X64-NEXT: kmovw %edi, %k1 5775; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5776; X64-NEXT: vmovapd %xmm2, %xmm0 5777; X64-NEXT: retq 5778entry: 5779 %0 = extractelement <2 x double> %__W, i64 0 5780 %1 = extractelement <2 x double> %__X, i64 0 5781 %2 = extractelement <2 x double> %__Y, i64 0 5782 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5783 %4 = bitcast i8 %__U to <8 x i1> 5784 %5 = extractelement <8 x i1> %4, i64 0 5785 %6 = select i1 %5, double %3, double %2 5786 %7 = insertelement <2 x double> %__Y, double %6, i64 0 5787 ret <2 x double> %7 5788} 5789 5790define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5791; X86-LABEL: test_mm_mask_fmsub_sd: 5792; X86: # %bb.0: # %entry 5793; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5794; X86-NEXT: kmovw %eax, %k1 5795; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2 5796; X86-NEXT: retl 5797; 5798; X64-LABEL: test_mm_mask_fmsub_sd: 5799; X64: # %bb.0: # %entry 5800; X64-NEXT: kmovw %edi, %k1 5801; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2 5802; X64-NEXT: retq 5803entry: 5804 %0 = extractelement <2 x double> %__W, i64 0 5805 %1 = extractelement <2 x double> %__A, i64 0 5806 %.rhs.i = extractelement <2 x double> %__B, i64 0 5807 %2 = fsub double -0.000000e+00, %.rhs.i 5808 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5809 %4 = and i8 %__U, 1 5810 %tobool.i = icmp eq i8 %4, 0 5811 %vecext1.i = extractelement <2 x double> %__W, i32 0 5812 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5813 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5814 ret <2 x double> %vecins.i 5815} 5816 5817define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5818; X86-LABEL: test_mm_mask_fmsub_round_sd: 5819; X86: # %bb.0: # %entry 5820; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5821; X86-NEXT: kmovw %eax, %k1 5822; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5823; X86-NEXT: retl 5824; 5825; X64-LABEL: test_mm_mask_fmsub_round_sd: 5826; X64: # %bb.0: # %entry 5827; X64-NEXT: kmovw %edi, %k1 5828; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5829; X64-NEXT: retq 5830entry: 5831 %0 = extractelement <2 x double> %__W, i64 0 5832 %1 = extractelement <2 x double> %__A, i64 0 5833 %.rhs = extractelement <2 x double> %__B, i64 0 5834 %2 = fsub double -0.000000e+00, %.rhs 5835 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5836 %4 = bitcast i8 %__U to <8 x i1> 5837 %5 = extractelement <8 x i1> %4, i64 0 5838 %6 = select i1 %5, double %3, double %0 5839 %7 = insertelement <2 x double> %__W, double %6, i64 0 5840 ret <2 x double> %7 5841} 5842 5843define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5844; X86-LABEL: test_mm_maskz_fmsub_sd: 5845; X86: # %bb.0: # %entry 5846; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5847; X86-NEXT: kmovw %eax, %k1 5848; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 5849; X86-NEXT: retl 5850; 5851; X64-LABEL: test_mm_maskz_fmsub_sd: 
5852; X64: # %bb.0: # %entry 5853; X64-NEXT: kmovw %edi, %k1 5854; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 5855; X64-NEXT: retq 5856entry: 5857 %0 = extractelement <2 x double> %__A, i64 0 5858 %1 = extractelement <2 x double> %__B, i64 0 5859 %.rhs.i = extractelement <2 x double> %__C, i64 0 5860 %2 = fsub double -0.000000e+00, %.rhs.i 5861 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5862 %4 = and i8 %__U, 1 5863 %tobool.i = icmp eq i8 %4, 0 5864 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5865 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5866 ret <2 x double> %vecins.i 5867} 5868 5869define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5870; X86-LABEL: test_mm_maskz_fmsub_round_sd: 5871; X86: # %bb.0: # %entry 5872; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5873; X86-NEXT: kmovw %eax, %k1 5874; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5875; X86-NEXT: retl 5876; 5877; X64-LABEL: test_mm_maskz_fmsub_round_sd: 5878; X64: # %bb.0: # %entry 5879; X64-NEXT: kmovw %edi, %k1 5880; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5881; X64-NEXT: retq 5882entry: 5883 %0 = extractelement <2 x double> %__A, i64 0 5884 %1 = extractelement <2 x double> %__B, i64 0 5885 %.rhs = extractelement <2 x double> %__C, i64 0 5886 %2 = fsub double -0.000000e+00, %.rhs 5887 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5888 %4 = bitcast i8 %__U to <8 x i1> 5889 %5 = extractelement <8 x i1> %4, i64 0 5890 %6 = select i1 %5, double %3, double 0.000000e+00 5891 %7 = insertelement <2 x double> %__A, double %6, i64 0 5892 ret <2 x double> %7 5893} 5894 5895define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5896; X86-LABEL: test_mm_mask3_fmsub_sd: 5897; X86: # %bb.0: # %entry 5898; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5899; X86-NEXT: kmovw %eax, %k1 5900; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2 5901; X86-NEXT: vmovapd %xmm2, %xmm0 5902; X86-NEXT: retl 5903; 5904; X64-LABEL: test_mm_mask3_fmsub_sd: 5905; X64: # %bb.0: # %entry 5906; X64-NEXT: kmovw %edi, %k1 5907; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2 5908; X64-NEXT: vmovapd %xmm2, %xmm0 5909; X64-NEXT: retq 5910entry: 5911 %0 = extractelement <2 x double> %__W, i64 0 5912 %1 = extractelement <2 x double> %__X, i64 0 5913 %.rhs.i = extractelement <2 x double> %__Y, i64 0 5914 %2 = fsub double -0.000000e+00, %.rhs.i 5915 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5916 %4 = and i8 %__U, 1 5917 %tobool.i = icmp eq i8 %4, 0 5918 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5919 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5920 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5921 ret <2 x double> %vecins.i 5922} 5923 5924define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5925; X86-LABEL: test_mm_mask3_fmsub_round_sd: 5926; X86: # %bb.0: # %entry 5927; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5928; X86-NEXT: kmovw %eax, %k1 5929; X86-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5930; X86-NEXT: vmovapd %xmm2, %xmm0 5931; X86-NEXT: retl 5932; 5933; X64-LABEL: test_mm_mask3_fmsub_round_sd: 5934; X64: # %bb.0: # %entry 5935; X64-NEXT: kmovw %edi, %k1 5936; X64-NEXT: 
vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5937; X64-NEXT: vmovapd %xmm2, %xmm0 5938; X64-NEXT: retq 5939entry: 5940 %0 = extractelement <2 x double> %__W, i64 0 5941 %1 = extractelement <2 x double> %__X, i64 0 5942 %.rhs = extractelement <2 x double> %__Y, i64 0 5943 %2 = fsub double -0.000000e+00, %.rhs 5944 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5945 %4 = bitcast i8 %__U to <8 x i1> 5946 %5 = extractelement <8 x i1> %4, i64 0 5947 %6 = select i1 %5, double %3, double %.rhs 5948 %7 = insertelement <2 x double> %__Y, double %6, i64 0 5949 ret <2 x double> %7 5950} 5951 5952define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5953; X86-LABEL: test_mm_mask_fnmadd_sd: 5954; X86: # %bb.0: # %entry 5955; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5956; X86-NEXT: kmovw %eax, %k1 5957; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2 5958; X86-NEXT: retl 5959; 5960; X64-LABEL: test_mm_mask_fnmadd_sd: 5961; X64: # %bb.0: # %entry 5962; X64-NEXT: kmovw %edi, %k1 5963; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2 5964; X64-NEXT: retq 5965entry: 5966 %0 = extractelement <2 x double> %__W, i64 0 5967 %.rhs.i = extractelement <2 x double> %__A, i64 0 5968 %1 = fsub double -0.000000e+00, %.rhs.i 5969 %2 = extractelement <2 x double> %__B, i64 0 5970 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5971 %4 = and i8 %__U, 1 5972 %tobool.i = icmp eq i8 %4, 0 5973 %vecext1.i = extractelement <2 x double> %__W, i32 0 5974 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5975 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5976 ret <2 x double> %vecins.i 5977} 5978 5979define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5980; X86-LABEL: test_mm_mask_fnmadd_round_sd: 5981; X86: # %bb.0: # %entry 5982; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5983; X86-NEXT: kmovw %eax, %k1 5984; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5985; X86-NEXT: retl 5986; 5987; X64-LABEL: test_mm_mask_fnmadd_round_sd: 5988; X64: # %bb.0: # %entry 5989; X64-NEXT: kmovw %edi, %k1 5990; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5991; X64-NEXT: retq 5992entry: 5993 %0 = extractelement <2 x double> %__W, i64 0 5994 %.rhs = extractelement <2 x double> %__A, i64 0 5995 %1 = fsub double -0.000000e+00, %.rhs 5996 %2 = extractelement <2 x double> %__B, i64 0 5997 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5998 %4 = bitcast i8 %__U to <8 x i1> 5999 %5 = extractelement <8 x i1> %4, i64 0 6000 %6 = select i1 %5, double %3, double %0 6001 %7 = insertelement <2 x double> %__W, double %6, i64 0 6002 ret <2 x double> %7 6003} 6004 6005define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6006; X86-LABEL: test_mm_maskz_fnmadd_sd: 6007; X86: # %bb.0: # %entry 6008; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6009; X86-NEXT: kmovw %eax, %k1 6010; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 6011; X86-NEXT: retl 6012; 6013; X64-LABEL: test_mm_maskz_fnmadd_sd: 6014; X64: # %bb.0: # %entry 6015; X64-NEXT: kmovw %edi, %k1 6016; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 6017; X64-NEXT: retq 6018entry: 6019 %0 = extractelement <2 x double> %__A, i64 0 6020 %.rhs.i = extractelement <2 x 
double> %__B, i64 0 6021 %1 = fsub double -0.000000e+00, %.rhs.i 6022 %2 = extractelement <2 x double> %__C, i64 0 6023 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6024 %4 = and i8 %__U, 1 6025 %tobool.i = icmp eq i8 %4, 0 6026 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 6027 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 6028 ret <2 x double> %vecins.i 6029} 6030 6031define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6032; X86-LABEL: test_mm_maskz_fnmadd_round_sd: 6033; X86: # %bb.0: # %entry 6034; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6035; X86-NEXT: kmovw %eax, %k1 6036; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6037; X86-NEXT: retl 6038; 6039; X64-LABEL: test_mm_maskz_fnmadd_round_sd: 6040; X64: # %bb.0: # %entry 6041; X64-NEXT: kmovw %edi, %k1 6042; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6043; X64-NEXT: retq 6044entry: 6045 %0 = extractelement <2 x double> %__A, i64 0 6046 %.rhs = extractelement <2 x double> %__B, i64 0 6047 %1 = fsub double -0.000000e+00, %.rhs 6048 %2 = extractelement <2 x double> %__C, i64 0 6049 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6050 %4 = bitcast i8 %__U to <8 x i1> 6051 %5 = extractelement <8 x i1> %4, i64 0 6052 %6 = select i1 %5, double %3, double 0.000000e+00 6053 %7 = insertelement <2 x double> %__A, double %6, i64 0 6054 ret <2 x double> %7 6055} 6056 6057define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6058; X86-LABEL: test_mm_mask3_fnmadd_sd: 6059; X86: # %bb.0: # %entry 6060; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6061; X86-NEXT: kmovw %eax, %k1 6062; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 6063; X86-NEXT: vmovapd %xmm2, %xmm0 6064; X86-NEXT: retl 6065; 6066; X64-LABEL: test_mm_mask3_fnmadd_sd: 6067; X64: # %bb.0: # %entry 6068; X64-NEXT: kmovw %edi, %k1 6069; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 6070; X64-NEXT: vmovapd %xmm2, %xmm0 6071; X64-NEXT: retq 6072entry: 6073 %0 = extractelement <2 x double> %__W, i64 0 6074 %.rhs.i = extractelement <2 x double> %__X, i64 0 6075 %1 = fsub double -0.000000e+00, %.rhs.i 6076 %2 = extractelement <2 x double> %__Y, i64 0 6077 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6078 %4 = and i8 %__U, 1 6079 %tobool.i = icmp eq i8 %4, 0 6080 %vecext1.i = extractelement <2 x double> %__Y, i32 0 6081 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 6082 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 6083 ret <2 x double> %vecins.i 6084} 6085 6086define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6087; X86-LABEL: test_mm_mask3_fnmadd_round_sd: 6088; X86: # %bb.0: # %entry 6089; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6090; X86-NEXT: kmovw %eax, %k1 6091; X86-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6092; X86-NEXT: vmovapd %xmm2, %xmm0 6093; X86-NEXT: retl 6094; 6095; X64-LABEL: test_mm_mask3_fnmadd_round_sd: 6096; X64: # %bb.0: # %entry 6097; X64-NEXT: kmovw %edi, %k1 6098; X64-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6099; X64-NEXT: vmovapd %xmm2, %xmm0 6100; X64-NEXT: retq 6101entry: 6102 %0 = extractelement <2 x double> %__W, i64 0 6103 %.rhs = extractelement <2 x double> %__X, i64 0 6104 %1 = fsub double 
-0.000000e+00, %.rhs 6105 %2 = extractelement <2 x double> %__Y, i64 0 6106 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6107 %4 = bitcast i8 %__U to <8 x i1> 6108 %5 = extractelement <8 x i1> %4, i64 0 6109 %6 = select i1 %5, double %3, double %2 6110 %7 = insertelement <2 x double> %__Y, double %6, i64 0 6111 ret <2 x double> %7 6112} 6113 6114define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 6115; X86-LABEL: test_mm_mask_fnmsub_sd: 6116; X86: # %bb.0: # %entry 6117; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6118; X86-NEXT: kmovw %eax, %k1 6119; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2 6120; X86-NEXT: retl 6121; 6122; X64-LABEL: test_mm_mask_fnmsub_sd: 6123; X64: # %bb.0: # %entry 6124; X64-NEXT: kmovw %edi, %k1 6125; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2 6126; X64-NEXT: retq 6127entry: 6128 %0 = extractelement <2 x double> %__W, i64 0 6129 %.rhs.i = extractelement <2 x double> %__A, i64 0 6130 %1 = fsub double -0.000000e+00, %.rhs.i 6131 %.rhs7.i = extractelement <2 x double> %__B, i64 0 6132 %2 = fsub double -0.000000e+00, %.rhs7.i 6133 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6134 %4 = and i8 %__U, 1 6135 %tobool.i = icmp eq i8 %4, 0 6136 %vecext2.i = extractelement <2 x double> %__W, i32 0 6137 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3 6138 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 6139 ret <2 x double> %vecins.i 6140} 6141 6142define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 6143; X86-LABEL: test_mm_mask_fnmsub_round_sd: 6144; X86: # %bb.0: # %entry 6145; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6146; X86-NEXT: kmovw %eax, %k1 6147; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 6148; X86-NEXT: retl 6149; 6150; X64-LABEL: test_mm_mask_fnmsub_round_sd: 6151; X64: # %bb.0: # %entry 6152; X64-NEXT: kmovw %edi, %k1 6153; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 6154; X64-NEXT: retq 6155entry: 6156 %0 = extractelement <2 x double> %__W, i64 0 6157 %.rhs = extractelement <2 x double> %__A, i64 0 6158 %1 = fsub double -0.000000e+00, %.rhs 6159 %.rhs2 = extractelement <2 x double> %__B, i64 0 6160 %2 = fsub double -0.000000e+00, %.rhs2 6161 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6162 %4 = bitcast i8 %__U to <8 x i1> 6163 %5 = extractelement <8 x i1> %4, i64 0 6164 %6 = select i1 %5, double %3, double %0 6165 %7 = insertelement <2 x double> %__W, double %6, i64 0 6166 ret <2 x double> %7 6167} 6168 6169define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6170; X86-LABEL: test_mm_maskz_fnmsub_sd: 6171; X86: # %bb.0: # %entry 6172; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6173; X86-NEXT: kmovw %eax, %k1 6174; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 6175; X86-NEXT: retl 6176; 6177; X64-LABEL: test_mm_maskz_fnmsub_sd: 6178; X64: # %bb.0: # %entry 6179; X64-NEXT: kmovw %edi, %k1 6180; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 6181; X64-NEXT: retq 6182entry: 6183 %0 = extractelement <2 x double> %__A, i64 0 6184 %.rhs.i = extractelement <2 x double> %__B, i64 0 6185 %1 = fsub double -0.000000e+00, %.rhs.i 6186 %.rhs5.i = extractelement <2 x double> %__C, i64 0 6187 %2 = fsub double 
-0.000000e+00, %.rhs5.i 6188 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6189 %4 = and i8 %__U, 1 6190 %tobool.i = icmp eq i8 %4, 0 6191 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 6192 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 6193 ret <2 x double> %vecins.i 6194} 6195 6196define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6197; X86-LABEL: test_mm_maskz_fnmsub_round_sd: 6198; X86: # %bb.0: # %entry 6199; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6200; X86-NEXT: kmovw %eax, %k1 6201; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6202; X86-NEXT: retl 6203; 6204; X64-LABEL: test_mm_maskz_fnmsub_round_sd: 6205; X64: # %bb.0: # %entry 6206; X64-NEXT: kmovw %edi, %k1 6207; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6208; X64-NEXT: retq 6209entry: 6210 %0 = extractelement <2 x double> %__A, i64 0 6211 %.rhs = extractelement <2 x double> %__B, i64 0 6212 %1 = fsub double -0.000000e+00, %.rhs 6213 %.rhs2 = extractelement <2 x double> %__C, i64 0 6214 %2 = fsub double -0.000000e+00, %.rhs2 6215 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6216 %4 = bitcast i8 %__U to <8 x i1> 6217 %5 = extractelement <8 x i1> %4, i64 0 6218 %6 = select i1 %5, double %3, double 0.000000e+00 6219 %7 = insertelement <2 x double> %__A, double %6, i64 0 6220 ret <2 x double> %7 6221} 6222 6223define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6224; X86-LABEL: test_mm_mask3_fnmsub_sd: 6225; X86: # %bb.0: # %entry 6226; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6227; X86-NEXT: kmovw %eax, %k1 6228; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 6229; X86-NEXT: vmovapd %xmm2, %xmm0 6230; X86-NEXT: retl 6231; 6232; X64-LABEL: test_mm_mask3_fnmsub_sd: 6233; X64: # %bb.0: # %entry 6234; X64-NEXT: kmovw %edi, %k1 6235; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 6236; X64-NEXT: vmovapd %xmm2, %xmm0 6237; X64-NEXT: retq 6238entry: 6239 %0 = extractelement <2 x double> %__W, i64 0 6240 %.rhs.i = extractelement <2 x double> %__X, i64 0 6241 %1 = fsub double -0.000000e+00, %.rhs.i 6242 %.rhs7.i = extractelement <2 x double> %__Y, i64 0 6243 %2 = fsub double -0.000000e+00, %.rhs7.i 6244 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6245 %4 = and i8 %__U, 1 6246 %tobool.i = icmp eq i8 %4, 0 6247 %vecext2.i = extractelement <2 x double> %__Y, i32 0 6248 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3 6249 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 6250 ret <2 x double> %vecins.i 6251} 6252 6253define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6254; X86-LABEL: test_mm_mask3_fnmsub_round_sd: 6255; X86: # %bb.0: # %entry 6256; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6257; X86-NEXT: kmovw %eax, %k1 6258; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6259; X86-NEXT: vmovapd %xmm2, %xmm0 6260; X86-NEXT: retl 6261; 6262; X64-LABEL: test_mm_mask3_fnmsub_round_sd: 6263; X64: # %bb.0: # %entry 6264; X64-NEXT: kmovw %edi, %k1 6265; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6266; X64-NEXT: vmovapd %xmm2, %xmm0 6267; X64-NEXT: retq 6268entry: 6269 %0 = extractelement <2 x double> %__W, i64 0 6270 %.rhs = extractelement <2 x double> %__X, i64 0 6271 %1 = 
fsub double -0.000000e+00, %.rhs 6272 %.rhs1 = extractelement <2 x double> %__Y, i64 0 6273 %2 = fsub double -0.000000e+00, %.rhs1 6274 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6275 %4 = bitcast i8 %__U to <8 x i1> 6276 %5 = extractelement <8 x i1> %4, i64 0 6277 %6 = select i1 %5, double %3, double %.rhs1 6278 %7 = insertelement <2 x double> %__Y, double %6, i64 0 6279 ret <2 x double> %7 6280} 6281 6282define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6283; X86-LABEL: test_mm512_mask_expandloadu_epi64: 6284; X86: # %bb.0: # %entry 6285; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6286; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6287; X86-NEXT: kmovw %ecx, %k1 6288; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} 6289; X86-NEXT: retl 6290; 6291; X64-LABEL: test_mm512_mask_expandloadu_epi64: 6292; X64: # %bb.0: # %entry 6293; X64-NEXT: kmovw %edi, %k1 6294; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} 6295; X64-NEXT: retq 6296entry: 6297 %0 = bitcast i8* %__P to i64* 6298 %1 = bitcast i8 %__U to <8 x i1> 6299 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W) 6300 ret <8 x i64> %2 6301} 6302 6303define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6304; X86-LABEL: test_mm512_maskz_expandloadu_epi64: 6305; X86: # %bb.0: # %entry 6306; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6307; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6308; X86-NEXT: kmovw %ecx, %k1 6309; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z} 6310; X86-NEXT: retl 6311; 6312; X64-LABEL: test_mm512_maskz_expandloadu_epi64: 6313; X64: # %bb.0: # %entry 6314; X64-NEXT: kmovw %edi, %k1 6315; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z} 6316; X64-NEXT: retq 6317entry: 6318 %0 = bitcast i8* %__P to i64* 6319 %1 = bitcast i8 %__U to <8 x i1> 6320 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer) 6321 ret <8 x i64> %2 6322} 6323 6324define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 6325; X86-LABEL: test_mm512_mask_expandloadu_pd: 6326; X86: # %bb.0: # %entry 6327; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6328; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6329; X86-NEXT: kmovw %ecx, %k1 6330; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} 6331; X86-NEXT: retl 6332; 6333; X64-LABEL: test_mm512_mask_expandloadu_pd: 6334; X64: # %bb.0: # %entry 6335; X64-NEXT: kmovw %edi, %k1 6336; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} 6337; X64-NEXT: retq 6338entry: 6339 %0 = bitcast i8* %__P to double* 6340 %1 = bitcast i8 %__U to <8 x i1> 6341 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W) 6342 ret <8 x double> %2 6343} 6344 6345define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 6346; X86-LABEL: test_mm512_maskz_expandloadu_pd: 6347; X86: # %bb.0: # %entry 6348; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6349; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6350; X86-NEXT: kmovw %ecx, %k1 6351; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z} 6352; X86-NEXT: retl 6353; 6354; X64-LABEL: test_mm512_maskz_expandloadu_pd: 6355; X64: # %bb.0: # %entry 6356; X64-NEXT: kmovw %edi, %k1 6357; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z} 6358; X64-NEXT: retq 6359entry: 6360 %0 = bitcast i8* %__P to double* 6361 %1 = bitcast i8 %__U to <8 x i1> 6362 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> 
zeroinitializer) 6363 ret <8 x double> %2 6364} 6365 6366define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) { 6367; X86-LABEL: test_mm512_mask_expandloadu_epi32: 6368; X86: # %bb.0: # %entry 6369; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6370; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx 6371; X86-NEXT: kmovw %ecx, %k1 6372; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} 6373; X86-NEXT: retl 6374; 6375; X64-LABEL: test_mm512_mask_expandloadu_epi32: 6376; X64: # %bb.0: # %entry 6377; X64-NEXT: kmovw %edi, %k1 6378; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} 6379; X64-NEXT: retq 6380entry: 6381 %0 = bitcast <8 x i64> %__W to <16 x i32> 6382 %1 = bitcast i8* %__P to i32* 6383 %2 = bitcast i16 %__U to <16 x i1> 6384 %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11 6385 %4 = bitcast <16 x i32> %3 to <8 x i64> 6386 ret <8 x i64> %4 6387} 6388 6389define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) { 6390; X86-LABEL: test_mm512_maskz_expandloadu_epi32: 6391; X86: # %bb.0: # %entry 6392; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6393; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx 6394; X86-NEXT: kmovw %ecx, %k1 6395; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z} 6396; X86-NEXT: retl 6397; 6398; X64-LABEL: test_mm512_maskz_expandloadu_epi32: 6399; X64: # %bb.0: # %entry 6400; X64-NEXT: kmovw %edi, %k1 6401; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z} 6402; X64-NEXT: retq 6403entry: 6404 %0 = bitcast i8* %__P to i32* 6405 %1 = bitcast i16 %__U to <16 x i1> 6406 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer) 6407 %3 = bitcast <16 x i32> %2 to <8 x i64> 6408 ret <8 x i64> %3 6409} 6410 6411define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) { 6412; X86-LABEL: test_mm512_mask_expandloadu_ps: 6413; X86: # %bb.0: # %entry 6414; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6415; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx 6416; X86-NEXT: kmovw %ecx, %k1 6417; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} 6418; X86-NEXT: retl 6419; 6420; X64-LABEL: test_mm512_mask_expandloadu_ps: 6421; X64: # %bb.0: # %entry 6422; X64-NEXT: kmovw %edi, %k1 6423; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} 6424; X64-NEXT: retq 6425entry: 6426 %0 = bitcast i8* %__P to float* 6427 %1 = bitcast i16 %__U to <16 x i1> 6428 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11 6429 ret <16 x float> %2 6430} 6431 6432define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) { 6433; X86-LABEL: test_mm512_maskz_expandloadu_ps: 6434; X86: # %bb.0: # %entry 6435; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6436; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx 6437; X86-NEXT: kmovw %ecx, %k1 6438; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z} 6439; X86-NEXT: retl 6440; 6441; X64-LABEL: test_mm512_maskz_expandloadu_ps: 6442; X64: # %bb.0: # %entry 6443; X64-NEXT: kmovw %edi, %k1 6444; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z} 6445; X64-NEXT: retq 6446entry: 6447 %0 = bitcast i8* %__P to float* 6448 %1 = bitcast i16 %__U to <16 x i1> 6449 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer) 6450 ret <16 x float> %2 6451} 6452 6453define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) { 6454; X86-LABEL: test_mm512_mask_compressstoreu_pd: 6455; X86: # %bb.0: # %entry 
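; vcompresspd stores only the elements selected by the mask, packed contiguously at the destination address; masked-off elements are not written to memory.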
6456; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6457; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6458; X86-NEXT: kmovw %eax, %k1 6459; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1} 6460; X86-NEXT: vzeroupper 6461; X86-NEXT: retl 6462; 6463; X64-LABEL: test_mm512_mask_compressstoreu_pd: 6464; X64: # %bb.0: # %entry 6465; X64-NEXT: kmovw %esi, %k1 6466; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1} 6467; X64-NEXT: vzeroupper 6468; X64-NEXT: retq 6469entry: 6470 %0 = bitcast i8* %__P to double* 6471 %1 = bitcast i8 %__U to <8 x i1> 6472 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1) 6473 ret void 6474} 6475 6476define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) { 6477; X86-LABEL: test_mm512_mask_compressstoreu_epi64: 6478; X86: # %bb.0: # %entry 6479; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6480; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6481; X86-NEXT: kmovw %eax, %k1 6482; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1} 6483; X86-NEXT: vzeroupper 6484; X86-NEXT: retl 6485; 6486; X64-LABEL: test_mm512_mask_compressstoreu_epi64: 6487; X64: # %bb.0: # %entry 6488; X64-NEXT: kmovw %esi, %k1 6489; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1} 6490; X64-NEXT: vzeroupper 6491; X64-NEXT: retq 6492entry: 6493 %0 = bitcast i8* %__P to i64* 6494 %1 = bitcast i8 %__U to <8 x i1> 6495 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1) 6496 ret void 6497} 6498 6499define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) { 6500; X86-LABEL: test_mm512_mask_compressstoreu_ps: 6501; X86: # %bb.0: # %entry 6502; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 6503; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6504; X86-NEXT: kmovw %eax, %k1 6505; X86-NEXT: vcompressps %zmm0, (%ecx) {%k1} 6506; X86-NEXT: vzeroupper 6507; X86-NEXT: retl 6508; 6509; X64-LABEL: test_mm512_mask_compressstoreu_ps: 6510; X64: # %bb.0: # %entry 6511; X64-NEXT: kmovw %esi, %k1 6512; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1} 6513; X64-NEXT: vzeroupper 6514; X64-NEXT: retq 6515entry: 6516 %0 = bitcast i8* %__P to float* 6517 %1 = bitcast i16 %__U to <16 x i1> 6518 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1) 6519 ret void 6520} 6521 6522define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) { 6523; X86-LABEL: test_mm512_mask_compressstoreu_epi32: 6524; X86: # %bb.0: # %entry 6525; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 6526; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6527; X86-NEXT: kmovw %eax, %k1 6528; X86-NEXT: vpcompressd %zmm0, (%ecx) {%k1} 6529; X86-NEXT: vzeroupper 6530; X86-NEXT: retl 6531; 6532; X64-LABEL: test_mm512_mask_compressstoreu_epi32: 6533; X64: # %bb.0: # %entry 6534; X64-NEXT: kmovw %esi, %k1 6535; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1} 6536; X64-NEXT: vzeroupper 6537; X64-NEXT: retq 6538entry: 6539 %0 = bitcast <8 x i64> %__A to <16 x i32> 6540 %1 = bitcast i8* %__P to i32* 6541 %2 = bitcast i16 %__U to <16 x i1> 6542 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2) 6543 ret void 6544} 6545 6546define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) { 6547; X86-LABEL: test_mm512_reduce_add_epi64: 6548; X86: # %bb.0: # %entry 6549; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6550; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6551; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6552; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6553; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6554; X86-NEXT: vpaddq %xmm0, 
%xmm1, %xmm0 6555; X86-NEXT: vmovd %xmm0, %eax 6556; X86-NEXT: vpextrd $1, %xmm0, %edx 6557; X86-NEXT: vzeroupper 6558; X86-NEXT: retl 6559; 6560; X64-LABEL: test_mm512_reduce_add_epi64: 6561; X64: # %bb.0: # %entry 6562; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6563; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6564; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6565; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6566; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6567; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 6568; X64-NEXT: vmovq %xmm0, %rax 6569; X64-NEXT: vzeroupper 6570; X64-NEXT: retq 6571entry: 6572 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6573 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6574 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i 6575 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6576 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6577 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i 6578 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6579 %add7.i = add <2 x i64> %shuffle6.i, %add4.i 6580 %vecext.i = extractelement <2 x i64> %add7.i, i32 0 6581 ret i64 %vecext.i 6582} 6583 6584define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { 6585; X86-LABEL: test_mm512_reduce_mul_epi64: 6586; X86: # %bb.0: # %entry 6587; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6588; X86-NEXT: vpsrlq $32, %ymm0, %ymm2 6589; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 6590; X86-NEXT: vpsrlq $32, %ymm1, %ymm3 6591; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 6592; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2 6593; X86-NEXT: vpsllq $32, %ymm2, %ymm2 6594; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 6595; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 6596; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6597; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 6598; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 6599; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 6600; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 6601; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6602; X86-NEXT: vpsllq $32, %xmm2, %xmm2 6603; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 6604; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6605; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6606; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6607; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 6608; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 6609; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 6610; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6611; X86-NEXT: vpsllq $32, %xmm2, %xmm2 6612; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 6613; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6614; X86-NEXT: vmovd %xmm0, %eax 6615; X86-NEXT: vpextrd $1, %xmm0, %edx 6616; X86-NEXT: vzeroupper 6617; X86-NEXT: retl 6618; 6619; X64-LABEL: test_mm512_reduce_mul_epi64: 6620; X64: # %bb.0: # %entry 6621; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6622; X64-NEXT: vpsrlq $32, %ymm0, %ymm2 6623; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 6624; X64-NEXT: vpsrlq $32, %ymm1, %ymm3 6625; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 6626; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2 6627; X64-NEXT: vpsllq $32, %ymm2, %ymm2 6628; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 6629; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 6630; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6631; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 6632; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 6633; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 6634; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 6635; X64-NEXT: vpaddq 
%xmm2, %xmm3, %xmm2 6636; X64-NEXT: vpsllq $32, %xmm2, %xmm2 6637; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 6638; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6639; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6640; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 6641; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 6642; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 6643; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 6644; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6645; X64-NEXT: vpsllq $32, %xmm2, %xmm2 6646; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 6647; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6648; X64-NEXT: vmovq %xmm0, %rax 6649; X64-NEXT: vzeroupper 6650; X64-NEXT: retq 6651entry: 6652 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6653 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6654 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i 6655 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6656 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6657 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i 6658 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6659 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i 6660 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 6661 ret i64 %vecext.i 6662} 6663 6664define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) { 6665; X86-LABEL: test_mm512_reduce_or_epi64: 6666; X86: # %bb.0: # %entry 6667; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6668; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 6669; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6670; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 6671; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6672; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 6673; X86-NEXT: vmovd %xmm0, %eax 6674; X86-NEXT: vpextrd $1, %xmm0, %edx 6675; X86-NEXT: vzeroupper 6676; X86-NEXT: retl 6677; 6678; X64-LABEL: test_mm512_reduce_or_epi64: 6679; X64: # %bb.0: # %entry 6680; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6681; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 6682; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6683; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 6684; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6685; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 6686; X64-NEXT: vmovq %xmm0, %rax 6687; X64-NEXT: vzeroupper 6688; X64-NEXT: retq 6689entry: 6690 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6691 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6692 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i 6693 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6694 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6695 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i 6696 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6697 %or7.i = or <2 x i64> %shuffle6.i, %or4.i 6698 %vecext.i = extractelement <2 x i64> %or7.i, i32 0 6699 ret i64 %vecext.i 6700} 6701 6702define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) { 6703; X86-LABEL: test_mm512_reduce_and_epi64: 6704; X86: # %bb.0: # %entry 6705; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6706; X86-NEXT: vpand %ymm1, %ymm0, %ymm0 6707; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6708; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 6709; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6710; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 6711; X86-NEXT: vmovd %xmm0, %eax 6712; X86-NEXT: 
vpextrd $1, %xmm0, %edx 6713; X86-NEXT: vzeroupper 6714; X86-NEXT: retl 6715; 6716; X64-LABEL: test_mm512_reduce_and_epi64: 6717; X64: # %bb.0: # %entry 6718; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6719; X64-NEXT: vpand %ymm1, %ymm0, %ymm0 6720; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6721; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 6722; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6723; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 6724; X64-NEXT: vmovq %xmm0, %rax 6725; X64-NEXT: vzeroupper 6726; X64-NEXT: retq 6727entry: 6728 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6729 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6730 %and.i = and <4 x i64> %shuffle.i, %shuffle1.i 6731 %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6732 %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6733 %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i 6734 %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6735 %and7.i = and <2 x i64> %shuffle6.i, %and4.i 6736 %vecext.i = extractelement <2 x i64> %and7.i, i32 0 6737 ret i64 %vecext.i 6738} 6739 6740define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) { 6741; X86-LABEL: test_mm512_mask_reduce_add_epi64: 6742; X86: # %bb.0: # %entry 6743; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6744; X86-NEXT: kmovw %eax, %k1 6745; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} 6746; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6747; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6748; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6749; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6750; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6751; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 6752; X86-NEXT: vmovd %xmm0, %eax 6753; X86-NEXT: vpextrd $1, %xmm0, %edx 6754; X86-NEXT: vzeroupper 6755; X86-NEXT: retl 6756; 6757; X64-LABEL: test_mm512_mask_reduce_add_epi64: 6758; X64: # %bb.0: # %entry 6759; X64-NEXT: kmovw %edi, %k1 6760; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} 6761; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6762; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6763; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6764; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6765; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6766; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 6767; X64-NEXT: vmovq %xmm0, %rax 6768; X64-NEXT: vzeroupper 6769; X64-NEXT: retq 6770entry: 6771 %0 = bitcast i8 %__M to <8 x i1> 6772 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer 6773 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6774 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6775 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i 6776 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6777 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6778 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i 6779 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6780 %add7.i = add <2 x i64> %shuffle6.i, %add4.i 6781 %vecext.i = extractelement <2 x i64> %add7.i, i32 0 6782 ret i64 %vecext.i 6783} 6784 6785define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { 6786; X86-LABEL: test_mm512_mask_reduce_mul_epi64: 6787; X86: # %bb.0: # %entry 6788; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6789; X86-NEXT: 
kmovw %eax, %k1 6790; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0] 6791; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 6792; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 6793; X86-NEXT: vpsrlq $32, %ymm1, %ymm2 6794; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2 6795; X86-NEXT: vpsrlq $32, %ymm0, %ymm3 6796; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 6797; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2 6798; X86-NEXT: vpsllq $32, %ymm2, %ymm2 6799; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 6800; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 6801; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6802; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 6803; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 6804; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 6805; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 6806; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6807; X86-NEXT: vpsllq $32, %xmm2, %xmm2 6808; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 6809; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6810; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6811; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6812; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 6813; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 6814; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 6815; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6816; X86-NEXT: vpsllq $32, %xmm2, %xmm2 6817; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 6818; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6819; X86-NEXT: vmovd %xmm0, %eax 6820; X86-NEXT: vpextrd $1, %xmm0, %edx 6821; X86-NEXT: vzeroupper 6822; X86-NEXT: retl 6823; 6824; X64-LABEL: test_mm512_mask_reduce_mul_epi64: 6825; X64: # %bb.0: # %entry 6826; X64-NEXT: kmovw %edi, %k1 6827; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1] 6828; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 6829; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 6830; X64-NEXT: vpsrlq $32, %ymm1, %ymm2 6831; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2 6832; X64-NEXT: vpsrlq $32, %ymm0, %ymm3 6833; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 6834; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2 6835; X64-NEXT: vpsllq $32, %ymm2, %ymm2 6836; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 6837; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 6838; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6839; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 6840; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 6841; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 6842; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 6843; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6844; X64-NEXT: vpsllq $32, %xmm2, %xmm2 6845; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 6846; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6847; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6848; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] 6849; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 6850; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 6851; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 6852; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6853; X64-NEXT: vpsllq $32, %xmm2, %xmm2 6854; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 6855; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6856; X64-NEXT: vmovq %xmm0, %rax 6857; X64-NEXT: vzeroupper 6858; X64-NEXT: retq 6859entry: 6860 %0 = bitcast i8 %__M to <8 x i1> 6861 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 6862 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6863 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6864 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i 6865 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6866 %shuffle3.i = 
shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6867 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i 6868 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6869 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i 6870 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0 6871 ret i64 %vecext.i 6872} 6873 6874define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) { 6875; X86-LABEL: test_mm512_mask_reduce_and_epi64: 6876; X86: # %bb.0: # %entry 6877; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6878; X86-NEXT: kmovw %eax, %k1 6879; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 6880; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 6881; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 6882; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 6883; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6884; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 6885; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6886; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 6887; X86-NEXT: vmovd %xmm0, %eax 6888; X86-NEXT: vpextrd $1, %xmm0, %edx 6889; X86-NEXT: vzeroupper 6890; X86-NEXT: retl 6891; 6892; X64-LABEL: test_mm512_mask_reduce_and_epi64: 6893; X64: # %bb.0: # %entry 6894; X64-NEXT: kmovw %edi, %k1 6895; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 6896; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 6897; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 6898; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 6899; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6900; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 6901; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6902; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 6903; X64-NEXT: vmovq %xmm0, %rax 6904; X64-NEXT: vzeroupper 6905; X64-NEXT: retq 6906entry: 6907 %0 = bitcast i8 %__M to <8 x i1> 6908 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> 6909 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6910 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6911 %and.i = and <4 x i64> %shuffle.i, %shuffle1.i 6912 %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6913 %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6914 %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i 6915 %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6916 %and7.i = and <2 x i64> %shuffle6.i, %and4.i 6917 %vecext.i = extractelement <2 x i64> %and7.i, i32 0 6918 ret i64 %vecext.i 6919} 6920 6921define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) { 6922; X86-LABEL: test_mm512_mask_reduce_or_epi64: 6923; X86: # %bb.0: # %entry 6924; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6925; X86-NEXT: kmovw %eax, %k1 6926; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} 6927; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6928; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 6929; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6930; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 6931; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6932; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 6933; X86-NEXT: vmovd %xmm0, %eax 6934; X86-NEXT: vpextrd $1, %xmm0, %edx 6935; X86-NEXT: vzeroupper 6936; X86-NEXT: retl 6937; 6938; X64-LABEL: test_mm512_mask_reduce_or_epi64: 6939; X64: # %bb.0: # %entry 6940; X64-NEXT: kmovw %edi, %k1 6941; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} 6942; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6943; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 6944; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 
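; Halving ladder used by all the 64-bit integer reductions here: combine the
; high 256 bits after vextracti64x4, combine the high 128 bits after
; vextracti128, then a vpshufd quadword swap feeds the final combine before
; the scalar result is moved to eax:edx (X86) or rax (X64).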
6945; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 6946; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 6947; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 6948; X64-NEXT: vmovq %xmm0, %rax 6949; X64-NEXT: vzeroupper 6950; X64-NEXT: retq 6951entry: 6952 %0 = bitcast i8 %__M to <8 x i1> 6953 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer 6954 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6955 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6956 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i 6957 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6958 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6959 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i 6960 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6961 %or7.i = or <2 x i64> %shuffle6.i, %or4.i 6962 %vecext.i = extractelement <2 x i64> %or7.i, i32 0 6963 ret i64 %vecext.i 6964} 6965 6966define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) { 6967; CHECK-LABEL: test_mm512_reduce_add_epi32: 6968; CHECK: # %bb.0: # %entry 6969; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6970; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 6971; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 6972; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 6973; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 6974; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 6975; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 6976; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 6977; CHECK-NEXT: vmovd %xmm0, %eax 6978; CHECK-NEXT: vzeroupper 6979; CHECK-NEXT: ret{{[l|q]}} 6980entry: 6981 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6982 %0 = bitcast <4 x i64> %extract.i to <8 x i32> 6983 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6984 %1 = bitcast <4 x i64> %extract2.i to <8 x i32> 6985 %add.i = add <8 x i32> %0, %1 6986 %2 = bitcast <8 x i32> %add.i to <4 x i64> 6987 %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6988 %3 = bitcast <2 x i64> %extract3.i to <4 x i32> 6989 %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6990 %4 = bitcast <2 x i64> %extract4.i to <4 x i32> 6991 %add5.i = add <4 x i32> %3, %4 6992 %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 6993 %add6.i = add <4 x i32> %shuffle.i, %add5.i 6994 %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 6995 %add8.i = add <4 x i32> %shuffle7.i, %add6.i 6996 %vecext.i = extractelement <4 x i32> %add8.i, i32 0 6997 ret i32 %vecext.i 6998} 6999 7000define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) { 7001; CHECK-LABEL: test_mm512_reduce_mul_epi32: 7002; CHECK: # %bb.0: # %entry 7003; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7004; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0 7005; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 7006; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0 7007; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7008; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0 7009; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7010; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0 7011; CHECK-NEXT: vmovd %xmm0, %eax 7012; CHECK-NEXT: vzeroupper 7013; CHECK-NEXT: ret{{[l|q]}} 7014entry: 7015 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> 
<i32 0, i32 1, i32 2, i32 3> 7016 %0 = bitcast <4 x i64> %extract.i to <8 x i32> 7017 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7018 %1 = bitcast <4 x i64> %extract2.i to <8 x i32> 7019 %mul.i = mul <8 x i32> %0, %1 7020 %2 = bitcast <8 x i32> %mul.i to <4 x i64> 7021 %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 7022 %3 = bitcast <2 x i64> %extract3.i to <4 x i32> 7023 %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 7024 %4 = bitcast <2 x i64> %extract4.i to <4 x i32> 7025 %mul5.i = mul <4 x i32> %3, %4 7026 %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7027 %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i 7028 %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 7029 %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i 7030 %vecext.i = extractelement <4 x i32> %mul8.i, i32 0 7031 ret i32 %vecext.i 7032} 7033 7034define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) { 7035; CHECK-LABEL: test_mm512_reduce_or_epi32: 7036; CHECK: # %bb.0: # %entry 7037; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7038; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 7039; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 7040; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 7041; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7042; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 7043; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7044; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 7045; CHECK-NEXT: vmovd %xmm0, %eax 7046; CHECK-NEXT: vzeroupper 7047; CHECK-NEXT: ret{{[l|q]}} 7048entry: 7049 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7050 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7051 %or25.i = or <4 x i64> %extract.i, %extract2.i 7052 %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 7053 %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 7054 %or526.i = or <2 x i64> %extract3.i, %extract4.i 7055 %or5.i = bitcast <2 x i64> %or526.i to <4 x i32> 7056 %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7057 %or6.i = or <4 x i32> %shuffle.i, %or5.i 7058 %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 7059 %or8.i = or <4 x i32> %shuffle7.i, %or6.i 7060 %vecext.i = extractelement <4 x i32> %or8.i, i32 0 7061 ret i32 %vecext.i 7062} 7063 7064define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) { 7065; CHECK-LABEL: test_mm512_reduce_and_epi32: 7066; CHECK: # %bb.0: # %entry 7067; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7068; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 7069; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 7070; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 7071; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7072; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 7073; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7074; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 7075; CHECK-NEXT: vmovd %xmm0, %eax 7076; CHECK-NEXT: vzeroupper 7077; CHECK-NEXT: ret{{[l|q]}} 7078entry: 7079 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7080 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7081 %and25.i = and <4 x i64> %extract.i, %extract2.i 7082 
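; Since or/and are lane-agnostic bitwise operations, the clang-generated IR
; for the epi32 reductions keeps the first two halving steps on
; <4 x i64>/<2 x i64> values and only bitcasts to <4 x i32> for the final
; in-register shuffle-and-combine steps.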
%extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 7083 %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 7084 %and526.i = and <2 x i64> %extract3.i, %extract4.i 7085 %and5.i = bitcast <2 x i64> %and526.i to <4 x i32> 7086 %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7087 %and6.i = and <4 x i32> %shuffle.i, %and5.i 7088 %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 7089 %and8.i = and <4 x i32> %shuffle7.i, %and6.i 7090 %vecext.i = extractelement <4 x i32> %and8.i, i32 0 7091 ret i32 %vecext.i 7092} 7093 7094define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) { 7095; X86-LABEL: test_mm512_mask_reduce_add_epi32: 7096; X86: # %bb.0: # %entry 7097; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 7098; X86-NEXT: kmovw %eax, %k1 7099; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} 7100; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7101; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0 7102; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7103; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 7104; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7105; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 7106; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7107; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 7108; X86-NEXT: vmovd %xmm0, %eax 7109; X86-NEXT: vzeroupper 7110; X86-NEXT: retl 7111; 7112; X64-LABEL: test_mm512_mask_reduce_add_epi32: 7113; X64: # %bb.0: # %entry 7114; X64-NEXT: kmovw %edi, %k1 7115; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} 7116; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7117; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 7118; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7119; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 7120; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7121; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 7122; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7123; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 7124; X64-NEXT: vmovd %xmm0, %eax 7125; X64-NEXT: vzeroupper 7126; X64-NEXT: retq 7127entry: 7128 %0 = bitcast <8 x i64> %__W to <16 x i32> 7129 %1 = bitcast i16 %__M to <16 x i1> 7130 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer 7131 %3 = bitcast <16 x i32> %2 to <8 x i64> 7132 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7133 %4 = bitcast <4 x i64> %extract.i to <8 x i32> 7134 %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7135 %5 = bitcast <4 x i64> %extract3.i to <8 x i32> 7136 %add.i = add <8 x i32> %4, %5 7137 %6 = bitcast <8 x i32> %add.i to <4 x i64> 7138 %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 7139 %7 = bitcast <2 x i64> %extract4.i to <4 x i32> 7140 %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 7141 %8 = bitcast <2 x i64> %extract5.i to <4 x i32> 7142 %add6.i = add <4 x i32> %7, %8 7143 %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7144 %add7.i = add <4 x i32> %shuffle.i, %add6.i 7145 %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 7146 %add9.i = add <4 x i32> %shuffle8.i, %add7.i 7147 %vecext.i = extractelement <4 x i32> %add9.i, i32 0 7148 ret i32 %vecext.i 7149} 7150 7151define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) { 7152; X86-LABEL: 
test_mm512_mask_reduce_mul_epi32: 7153; X86: # %bb.0: # %entry 7154; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 7155; X86-NEXT: kmovw %eax, %k1 7156; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 7157; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 7158; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 7159; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0 7160; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7161; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0 7162; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7163; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0 7164; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7165; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0 7166; X86-NEXT: vmovd %xmm0, %eax 7167; X86-NEXT: vzeroupper 7168; X86-NEXT: retl 7169; 7170; X64-LABEL: test_mm512_mask_reduce_mul_epi32: 7171; X64: # %bb.0: # %entry 7172; X64-NEXT: kmovw %edi, %k1 7173; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 7174; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 7175; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 7176; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0 7177; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7178; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 7179; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7180; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0 7181; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7182; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0 7183; X64-NEXT: vmovd %xmm0, %eax 7184; X64-NEXT: vzeroupper 7185; X64-NEXT: retq 7186entry: 7187 %0 = bitcast <8 x i64> %__W to <16 x i32> 7188 %1 = bitcast i16 %__M to <16 x i1> 7189 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 7190 %3 = bitcast <16 x i32> %2 to <8 x i64> 7191 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7192 %4 = bitcast <4 x i64> %extract.i to <8 x i32> 7193 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7194 %5 = bitcast <4 x i64> %extract4.i to <8 x i32> 7195 %mul.i = mul <8 x i32> %4, %5 7196 %6 = bitcast <8 x i32> %mul.i to <4 x i64> 7197 %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 7198 %7 = bitcast <2 x i64> %extract5.i to <4 x i32> 7199 %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 7200 %8 = bitcast <2 x i64> %extract6.i to <4 x i32> 7201 %mul7.i = mul <4 x i32> %7, %8 7202 %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7203 %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i 7204 %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 7205 %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i 7206 %vecext.i = extractelement <4 x i32> %mul10.i, i32 0 7207 ret i32 %vecext.i 7208} 7209 7210define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) { 7211; X86-LABEL: test_mm512_mask_reduce_and_epi32: 7212; X86: # %bb.0: # %entry 7213; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 7214; X86-NEXT: kmovw %eax, %k1 7215; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 7216; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 7217; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 7218; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 7219; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7220; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 7221; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7222; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 7223; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7224; X86-NEXT: 
vpand %xmm0, %xmm1, %xmm0 7225; X86-NEXT: vmovd %xmm0, %eax 7226; X86-NEXT: vzeroupper 7227; X86-NEXT: retl 7228; 7229; X64-LABEL: test_mm512_mask_reduce_and_epi32: 7230; X64: # %bb.0: # %entry 7231; X64-NEXT: kmovw %edi, %k1 7232; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 7233; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 7234; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 7235; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 7236; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7237; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 7238; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7239; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 7240; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7241; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 7242; X64-NEXT: vmovd %xmm0, %eax 7243; X64-NEXT: vzeroupper 7244; X64-NEXT: retq 7245entry: 7246 %0 = bitcast <8 x i64> %__W to <16 x i32> 7247 %1 = bitcast i16 %__M to <16 x i1> 7248 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 7249 %3 = bitcast <16 x i32> %2 to <8 x i64> 7250 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7251 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7252 %and28.i = and <4 x i64> %extract.i, %extract4.i 7253 %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 7254 %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 7255 %and729.i = and <2 x i64> %extract5.i, %extract6.i 7256 %and7.i = bitcast <2 x i64> %and729.i to <4 x i32> 7257 %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7258 %and8.i = and <4 x i32> %shuffle.i, %and7.i 7259 %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 7260 %and10.i = and <4 x i32> %shuffle9.i, %and8.i 7261 %vecext.i = extractelement <4 x i32> %and10.i, i32 0 7262 ret i32 %vecext.i 7263} 7264 7265define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) { 7266; X86-LABEL: test_mm512_mask_reduce_or_epi32: 7267; X86: # %bb.0: # %entry 7268; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 7269; X86-NEXT: kmovw %eax, %k1 7270; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} 7271; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7272; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 7273; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7274; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 7275; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7276; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 7277; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7278; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 7279; X86-NEXT: vmovd %xmm0, %eax 7280; X86-NEXT: vzeroupper 7281; X86-NEXT: retl 7282; 7283; X64-LABEL: test_mm512_mask_reduce_or_epi32: 7284; X64: # %bb.0: # %entry 7285; X64-NEXT: kmovw %edi, %k1 7286; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} 7287; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 7288; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 7289; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7290; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 7291; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7292; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 7293; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 7294; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 7295; X64-NEXT: vmovd %xmm0, %eax 7296; X64-NEXT: vzeroupper 7297; X64-NEXT: retq 7298entry: 7299 %0 = bitcast <8 x i64> %__W to <16 x i32> 7300 %1 = bitcast i16 %__M to <16 x i1> 7301 %2 = select <16 x 
i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer 7302 %3 = bitcast <16 x i32> %2 to <8 x i64> 7303 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7304 %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7305 %or27.i = or <4 x i64> %extract.i, %extract3.i 7306 %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 7307 %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 7308 %or628.i = or <2 x i64> %extract4.i, %extract5.i 7309 %or6.i = bitcast <2 x i64> %or628.i to <4 x i32> 7310 %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7311 %or7.i = or <4 x i32> %shuffle.i, %or6.i 7312 %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 7313 %or9.i = or <4 x i32> %shuffle8.i, %or7.i 7314 %vecext.i = extractelement <4 x i32> %or9.i, i32 0 7315 ret i32 %vecext.i 7316} 7317 7318define double @test_mm512_reduce_add_pd(<8 x double> %__W) { 7319; X86-LABEL: test_mm512_reduce_add_pd: 7320; X86: # %bb.0: # %entry 7321; X86-NEXT: pushl %ebp 7322; X86-NEXT: .cfi_def_cfa_offset 8 7323; X86-NEXT: .cfi_offset %ebp, -8 7324; X86-NEXT: movl %esp, %ebp 7325; X86-NEXT: .cfi_def_cfa_register %ebp 7326; X86-NEXT: andl $-8, %esp 7327; X86-NEXT: subl $8, %esp 7328; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7329; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 7330; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7331; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7332; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7333; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 7334; X86-NEXT: vmovsd %xmm0, (%esp) 7335; X86-NEXT: fldl (%esp) 7336; X86-NEXT: movl %ebp, %esp 7337; X86-NEXT: popl %ebp 7338; X86-NEXT: .cfi_def_cfa %esp, 4 7339; X86-NEXT: vzeroupper 7340; X86-NEXT: retl 7341; 7342; X64-LABEL: test_mm512_reduce_add_pd: 7343; X64: # %bb.0: # %entry 7344; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7345; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 7346; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7347; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7348; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7349; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 7350; X64-NEXT: vzeroupper 7351; X64-NEXT: retq 7352entry: 7353 %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7354 %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7355 %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i 7356 %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7357 %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7358 %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i 7359 %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7360 %add7.i = fadd <2 x double> %add4.i, %shuffle6.i 7361 %vecext.i = extractelement <2 x double> %add7.i, i32 0 7362 ret double %vecext.i 7363} 7364 7365define double @test_mm512_reduce_mul_pd(<8 x double> %__W) { 7366; X86-LABEL: test_mm512_reduce_mul_pd: 7367; X86: # %bb.0: # %entry 7368; X86-NEXT: pushl %ebp 7369; X86-NEXT: .cfi_def_cfa_offset 8 7370; X86-NEXT: .cfi_offset %ebp, -8 7371; X86-NEXT: movl %esp, %ebp 7372; X86-NEXT: .cfi_def_cfa_register %ebp 7373; X86-NEXT: andl $-8, %esp 7374; X86-NEXT: subl $8, %esp 7375; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7376; X86-NEXT: vmulpd 
%ymm1, %ymm0, %ymm0 7377; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7378; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7379; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7380; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 7381; X86-NEXT: vmovsd %xmm0, (%esp) 7382; X86-NEXT: fldl (%esp) 7383; X86-NEXT: movl %ebp, %esp 7384; X86-NEXT: popl %ebp 7385; X86-NEXT: .cfi_def_cfa %esp, 4 7386; X86-NEXT: vzeroupper 7387; X86-NEXT: retl 7388; 7389; X64-LABEL: test_mm512_reduce_mul_pd: 7390; X64: # %bb.0: # %entry 7391; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7392; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0 7393; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7394; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7395; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7396; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 7397; X64-NEXT: vzeroupper 7398; X64-NEXT: retq 7399entry: 7400 %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7401 %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7402 %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i 7403 %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7404 %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7405 %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i 7406 %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7407 %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i 7408 %vecext.i = extractelement <2 x double> %mul7.i, i32 0 7409 ret double %vecext.i 7410} 7411 7412define float @test_mm512_reduce_add_ps(<16 x float> %__W) { 7413; X86-LABEL: test_mm512_reduce_add_ps: 7414; X86: # %bb.0: # %entry 7415; X86-NEXT: pushl %eax 7416; X86-NEXT: .cfi_def_cfa_offset 8 7417; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7418; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 7419; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7420; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 7421; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7422; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 7423; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7424; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 7425; X86-NEXT: vmovss %xmm0, (%esp) 7426; X86-NEXT: flds (%esp) 7427; X86-NEXT: popl %eax 7428; X86-NEXT: .cfi_def_cfa_offset 4 7429; X86-NEXT: vzeroupper 7430; X86-NEXT: retl 7431; 7432; X64-LABEL: test_mm512_reduce_add_ps: 7433; X64: # %bb.0: # %entry 7434; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7435; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 7436; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7437; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 7438; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7439; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 7440; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7441; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 7442; X64-NEXT: vzeroupper 7443; X64-NEXT: retq 7444entry: 7445 %0 = bitcast <16 x float> %__W to <8 x double> 7446 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7447 %1 = bitcast <4 x double> %extract.i to <8 x float> 7448 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7449 %2 = bitcast <4 x double> %extract2.i to <8 x float> 7450 %add.i = fadd <8 x float> %1, %2 7451 %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7452 %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7453 %add5.i = fadd <4 x float> 
%extract3.i, %extract4.i 7454 %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7455 %add6.i = fadd <4 x float> %add5.i, %shuffle.i 7456 %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 7457 %add8.i = fadd <4 x float> %add6.i, %shuffle7.i 7458 %vecext.i = extractelement <4 x float> %add8.i, i32 0 7459 ret float %vecext.i 7460} 7461 7462define float @test_mm512_reduce_mul_ps(<16 x float> %__W) { 7463; X86-LABEL: test_mm512_reduce_mul_ps: 7464; X86: # %bb.0: # %entry 7465; X86-NEXT: pushl %eax 7466; X86-NEXT: .cfi_def_cfa_offset 8 7467; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7468; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0 7469; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7470; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 7471; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7472; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 7473; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7474; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 7475; X86-NEXT: vmovss %xmm0, (%esp) 7476; X86-NEXT: flds (%esp) 7477; X86-NEXT: popl %eax 7478; X86-NEXT: .cfi_def_cfa_offset 4 7479; X86-NEXT: vzeroupper 7480; X86-NEXT: retl 7481; 7482; X64-LABEL: test_mm512_reduce_mul_ps: 7483; X64: # %bb.0: # %entry 7484; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7485; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0 7486; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7487; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 7488; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7489; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 7490; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7491; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 7492; X64-NEXT: vzeroupper 7493; X64-NEXT: retq 7494entry: 7495 %0 = bitcast <16 x float> %__W to <8 x double> 7496 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7497 %1 = bitcast <4 x double> %extract.i to <8 x float> 7498 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7499 %2 = bitcast <4 x double> %extract2.i to <8 x float> 7500 %mul.i = fmul <8 x float> %1, %2 7501 %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7502 %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7503 %mul5.i = fmul <4 x float> %extract3.i, %extract4.i 7504 %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7505 %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i 7506 %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 7507 %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i 7508 %vecext.i = extractelement <4 x float> %mul8.i, i32 0 7509 ret float %vecext.i 7510} 7511 7512define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) { 7513; X86-LABEL: test_mm512_mask_reduce_add_pd: 7514; X86: # %bb.0: # %entry 7515; X86-NEXT: pushl %ebp 7516; X86-NEXT: .cfi_def_cfa_offset 8 7517; X86-NEXT: .cfi_offset %ebp, -8 7518; X86-NEXT: movl %esp, %ebp 7519; X86-NEXT: .cfi_def_cfa_register %ebp 7520; X86-NEXT: andl $-8, %esp 7521; X86-NEXT: subl $8, %esp 7522; X86-NEXT: movb 8(%ebp), %al 7523; X86-NEXT: kmovw %eax, %k1 7524; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} 7525; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7526; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 7527; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7528; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7529; X86-NEXT: vpermilpd 
{{.*#+}} xmm1 = xmm0[1,0] 7530; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 7531; X86-NEXT: vmovsd %xmm0, (%esp) 7532; X86-NEXT: fldl (%esp) 7533; X86-NEXT: movl %ebp, %esp 7534; X86-NEXT: popl %ebp 7535; X86-NEXT: .cfi_def_cfa %esp, 4 7536; X86-NEXT: vzeroupper 7537; X86-NEXT: retl 7538; 7539; X64-LABEL: test_mm512_mask_reduce_add_pd: 7540; X64: # %bb.0: # %entry 7541; X64-NEXT: kmovw %edi, %k1 7542; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} 7543; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7544; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 7545; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7546; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7547; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7548; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 7549; X64-NEXT: vzeroupper 7550; X64-NEXT: retq 7551entry: 7552 %0 = bitcast i8 %__M to <8 x i1> 7553 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer 7554 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7555 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7556 %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i 7557 %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7558 %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7559 %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i 7560 %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7561 %add7.i = fadd <2 x double> %add4.i, %shuffle6.i 7562 %vecext.i = extractelement <2 x double> %add7.i, i32 0 7563 ret double %vecext.i 7564} 7565 7566define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) { 7567; X86-LABEL: test_mm512_mask_reduce_mul_pd: 7568; X86: # %bb.0: # %entry 7569; X86-NEXT: pushl %ebp 7570; X86-NEXT: .cfi_def_cfa_offset 8 7571; X86-NEXT: .cfi_offset %ebp, -8 7572; X86-NEXT: movl %esp, %ebp 7573; X86-NEXT: .cfi_def_cfa_register %ebp 7574; X86-NEXT: andl $-8, %esp 7575; X86-NEXT: subl $8, %esp 7576; X86-NEXT: movb 8(%ebp), %al 7577; X86-NEXT: kmovw %eax, %k1 7578; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 7579; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1} 7580; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7581; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0 7582; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7583; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7584; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7585; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 7586; X86-NEXT: vmovsd %xmm0, (%esp) 7587; X86-NEXT: fldl (%esp) 7588; X86-NEXT: movl %ebp, %esp 7589; X86-NEXT: popl %ebp 7590; X86-NEXT: .cfi_def_cfa %esp, 4 7591; X86-NEXT: vzeroupper 7592; X86-NEXT: retl 7593; 7594; X64-LABEL: test_mm512_mask_reduce_mul_pd: 7595; X64: # %bb.0: # %entry 7596; X64-NEXT: kmovw %edi, %k1 7597; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 7598; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1} 7599; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7600; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0 7601; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7602; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7603; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7604; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 7605; X64-NEXT: vzeroupper 7606; X64-NEXT: retq 7607entry: 7608 %0 = bitcast i8 %__M to <8 x i1> 7609 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 
1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> 7610 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7611 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7612 %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i 7613 %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7614 %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7615 %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i 7616 %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7617 %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i 7618 %vecext.i = extractelement <2 x double> %mul7.i, i32 0 7619 ret double %vecext.i 7620} 7621 7622define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) { 7623; X86-LABEL: test_mm512_mask_reduce_add_ps: 7624; X86: # %bb.0: # %entry 7625; X86-NEXT: pushl %eax 7626; X86-NEXT: .cfi_def_cfa_offset 8 7627; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 7628; X86-NEXT: kmovw %eax, %k1 7629; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} 7630; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7631; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 7632; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7633; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 7634; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7635; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 7636; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7637; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 7638; X86-NEXT: vmovss %xmm0, (%esp) 7639; X86-NEXT: flds (%esp) 7640; X86-NEXT: popl %eax 7641; X86-NEXT: .cfi_def_cfa_offset 4 7642; X86-NEXT: vzeroupper 7643; X86-NEXT: retl 7644; 7645; X64-LABEL: test_mm512_mask_reduce_add_ps: 7646; X64: # %bb.0: # %entry 7647; X64-NEXT: kmovw %edi, %k1 7648; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} 7649; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7650; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 7651; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7652; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 7653; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7654; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 7655; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7656; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 7657; X64-NEXT: vzeroupper 7658; X64-NEXT: retq 7659entry: 7660 %0 = bitcast i16 %__M to <16 x i1> 7661 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer 7662 %2 = bitcast <16 x float> %1 to <8 x double> 7663 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7664 %3 = bitcast <4 x double> %extract.i to <8 x float> 7665 %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7666 %4 = bitcast <4 x double> %extract3.i to <8 x float> 7667 %add.i = fadd <8 x float> %3, %4 7668 %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7669 %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7670 %add6.i = fadd <4 x float> %extract4.i, %extract5.i 7671 %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7672 %add7.i = fadd <4 x float> %add6.i, %shuffle.i 7673 %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 7674 %add9.i = fadd <4 x float> %add7.i, %shuffle8.i 7675 %vecext.i = extractelement <4 x float> %add9.i, 
i32 0 7676 ret float %vecext.i 7677} 7678 7679define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) { 7680; X86-LABEL: test_mm512_mask_reduce_mul_ps: 7681; X86: # %bb.0: # %entry 7682; X86-NEXT: pushl %eax 7683; X86-NEXT: .cfi_def_cfa_offset 8 7684; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 7685; X86-NEXT: kmovw %eax, %k1 7686; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 7687; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} 7688; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7689; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0 7690; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7691; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 7692; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7693; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 7694; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7695; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 7696; X86-NEXT: vmovss %xmm0, (%esp) 7697; X86-NEXT: flds (%esp) 7698; X86-NEXT: popl %eax 7699; X86-NEXT: .cfi_def_cfa_offset 4 7700; X86-NEXT: vzeroupper 7701; X86-NEXT: retl 7702; 7703; X64-LABEL: test_mm512_mask_reduce_mul_ps: 7704; X64: # %bb.0: # %entry 7705; X64-NEXT: kmovw %edi, %k1 7706; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 7707; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1} 7708; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7709; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0 7710; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7711; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 7712; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7713; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 7714; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 7715; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 7716; X64-NEXT: vzeroupper 7717; X64-NEXT: retq 7718entry: 7719 %0 = bitcast i16 %__M to <16 x i1> 7720 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> 7721 %2 = bitcast <16 x float> %1 to <8 x double> 7722 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7723 %3 = bitcast <4 x double> %extract.i to <8 x float> 7724 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7725 %4 = bitcast <4 x double> %extract4.i to <8 x float> 7726 %mul.i = fmul <8 x float> %3, %4 7727 %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7728 %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7729 %mul7.i = fmul <4 x float> %extract5.i, %extract6.i 7730 %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7731 %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i 7732 %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 7733 %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i 7734 %vecext.i = extractelement <4 x float> %mul10.i, i32 0 7735 ret float %vecext.i 7736} 7737 7738define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) { 7739; X86-LABEL: test_mm512_reduce_max_epi64: 7740; X86: # %bb.0: # %entry 7741; X86-NEXT: 
vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7742; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 7743; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7744; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7745; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7746; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7747; X86-NEXT: vmovd %xmm0, %eax 7748; X86-NEXT: vpextrd $1, %xmm0, %edx 7749; X86-NEXT: vzeroupper 7750; X86-NEXT: retl 7751; 7752; X64-LABEL: test_mm512_reduce_max_epi64: 7753; X64: # %bb.0: # %entry 7754; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7755; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 7756; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7757; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7758; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7759; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7760; X64-NEXT: vmovq %xmm0, %rax 7761; X64-NEXT: vzeroupper 7762; X64-NEXT: retq 7763entry: 7764 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7765 %0 = icmp slt <8 x i64> %shuffle.i, %__W 7766 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7767 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7768 %2 = icmp sgt <8 x i64> %1, %shuffle1.i 7769 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7770 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7771 %4 = icmp sgt <8 x i64> %3, %shuffle3.i 7772 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7773 %vecext.i = extractelement <8 x i64> %5, i32 0 7774 ret i64 %vecext.i 7775} 7776 7777define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) { 7778; X86-LABEL: test_mm512_reduce_max_epu64: 7779; X86: # %bb.0: # %entry 7780; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7781; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 7782; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7783; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7784; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7785; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7786; X86-NEXT: vmovd %xmm0, %eax 7787; X86-NEXT: vpextrd $1, %xmm0, %edx 7788; X86-NEXT: vzeroupper 7789; X86-NEXT: retl 7790; 7791; X64-LABEL: test_mm512_reduce_max_epu64: 7792; X64: # %bb.0: # %entry 7793; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7794; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 7795; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7796; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7797; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7798; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7799; X64-NEXT: vmovq %xmm0, %rax 7800; X64-NEXT: vzeroupper 7801; X64-NEXT: retq 7802entry: 7803 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7804 %0 = icmp ult <8 x i64> %shuffle.i, %__W 7805 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7806 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7807 %2 = icmp ugt <8 x i64> %1, %shuffle1.i 7808 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7809 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7810 %4 = icmp ugt <8 x i64> %3, %shuffle3.i 7811 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7812 %vecext.i = extractelement <8 x i64> %5, i32 0 7813 ret i64 %vecext.i 7814} 7815 7816define double 
@test_mm512_reduce_max_pd(<8 x double> %__W) { 7817; X86-LABEL: test_mm512_reduce_max_pd: 7818; X86: # %bb.0: # %entry 7819; X86-NEXT: pushl %ebp 7820; X86-NEXT: .cfi_def_cfa_offset 8 7821; X86-NEXT: .cfi_offset %ebp, -8 7822; X86-NEXT: movl %esp, %ebp 7823; X86-NEXT: .cfi_def_cfa_register %ebp 7824; X86-NEXT: andl $-8, %esp 7825; X86-NEXT: subl $8, %esp 7826; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7827; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 7828; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7829; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7830; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7831; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 7832; X86-NEXT: vmovsd %xmm0, (%esp) 7833; X86-NEXT: fldl (%esp) 7834; X86-NEXT: movl %ebp, %esp 7835; X86-NEXT: popl %ebp 7836; X86-NEXT: .cfi_def_cfa %esp, 4 7837; X86-NEXT: vzeroupper 7838; X86-NEXT: retl 7839; 7840; X64-LABEL: test_mm512_reduce_max_pd: 7841; X64: # %bb.0: # %entry 7842; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7843; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 7844; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7845; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7846; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7847; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 7848; X64-NEXT: vzeroupper 7849; X64-NEXT: retq 7850entry: 7851 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7852 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7853 %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i) 7854 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7855 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7856 %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i) 7857 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7858 %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i) 7859 %vecext.i = extractelement <2 x double> %2, i32 0 7860 ret double %vecext.i 7861} 7862 7863define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) { 7864; X86-LABEL: test_mm512_reduce_min_epi64: 7865; X86: # %bb.0: # %entry 7866; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7867; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 7868; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7869; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7870; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7871; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7872; X86-NEXT: vmovd %xmm0, %eax 7873; X86-NEXT: vpextrd $1, %xmm0, %edx 7874; X86-NEXT: vzeroupper 7875; X86-NEXT: retl 7876; 7877; X64-LABEL: test_mm512_reduce_min_epi64: 7878; X64: # %bb.0: # %entry 7879; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7880; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0 7881; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7882; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7883; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7884; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7885; X64-NEXT: vmovq %xmm0, %rax 7886; X64-NEXT: vzeroupper 7887; X64-NEXT: retq 7888entry: 7889 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7890 %0 = icmp sgt <8 x i64> %shuffle.i, %__W 7891 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7892 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 
4, i32 5> 7893 %2 = icmp slt <8 x i64> %1, %shuffle1.i 7894 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7895 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7896 %4 = icmp slt <8 x i64> %3, %shuffle3.i 7897 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7898 %vecext.i = extractelement <8 x i64> %5, i32 0 7899 ret i64 %vecext.i 7900} 7901 7902define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) { 7903; X86-LABEL: test_mm512_reduce_min_epu64: 7904; X86: # %bb.0: # %entry 7905; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7906; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0 7907; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 7908; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7909; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7910; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7911; X86-NEXT: vmovd %xmm0, %eax 7912; X86-NEXT: vpextrd $1, %xmm0, %edx 7913; X86-NEXT: vzeroupper 7914; X86-NEXT: retl 7915; 7916; X64-LABEL: test_mm512_reduce_min_epu64: 7917; X64: # %bb.0: # %entry 7918; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7919; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 7920; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 7921; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7922; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 7923; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7924; X64-NEXT: vmovq %xmm0, %rax 7925; X64-NEXT: vzeroupper 7926; X64-NEXT: retq 7927entry: 7928 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7929 %0 = icmp ugt <8 x i64> %shuffle.i, %__W 7930 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7931 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7932 %2 = icmp ult <8 x i64> %1, %shuffle1.i 7933 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7934 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7935 %4 = icmp ult <8 x i64> %3, %shuffle3.i 7936 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7937 %vecext.i = extractelement <8 x i64> %5, i32 0 7938 ret i64 %vecext.i 7939} 7940 7941define double @test_mm512_reduce_min_pd(<8 x double> %__W) { 7942; X86-LABEL: test_mm512_reduce_min_pd: 7943; X86: # %bb.0: # %entry 7944; X86-NEXT: pushl %ebp 7945; X86-NEXT: .cfi_def_cfa_offset 8 7946; X86-NEXT: .cfi_offset %ebp, -8 7947; X86-NEXT: movl %esp, %ebp 7948; X86-NEXT: .cfi_def_cfa_register %ebp 7949; X86-NEXT: andl $-8, %esp 7950; X86-NEXT: subl $8, %esp 7951; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7952; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0 7953; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7954; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 7955; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7956; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0 7957; X86-NEXT: vmovsd %xmm0, (%esp) 7958; X86-NEXT: fldl (%esp) 7959; X86-NEXT: movl %ebp, %esp 7960; X86-NEXT: popl %ebp 7961; X86-NEXT: .cfi_def_cfa %esp, 4 7962; X86-NEXT: vzeroupper 7963; X86-NEXT: retl 7964; 7965; X64-LABEL: test_mm512_reduce_min_pd: 7966; X64: # %bb.0: # %entry 7967; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7968; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0 7969; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7970; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0 7971; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7972; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0 7973; X64-NEXT: vzeroupper 7974; X64-NEXT: retq 
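; As with the max_pd reduction above, reduce_min_pd halves the vector with
; 256- and 128-bit extracts plus vminpd and finishes with a scalar vminsd;
; the IR below tail-calls the AVX/SSE2 min intrinsics at each halving step.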
entry:
  %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
  %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %2, i32 0
  ret double %vecext.i
}

; Masked signed-max reduction: inactive lanes are seeded with INT64_MIN, the
; identity for signed max, before the shuffle-and-compare tree.
define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp sgt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp sgt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp sgt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

; Unsigned-max reduction: the identity is zero, so a zero-masking move ({z})
; seeds the inactive lanes.
define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ugt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ugt <8 x i64> %3, %shuffle2.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
  %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ugt <8 x i64> %5, %shuffle4.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

; FP max reduction: inactive lanes are seeded with -Inf.
define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

; Signed-min reduction: inactive lanes are seeded with INT64_MAX.
define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp slt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp slt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp slt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

; Unsigned-min reduction: the identity is all-ones (UINT64_MAX), materialized
; with vpternlogd $255.
define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ult <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ult <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

; FP min reduction: inactive lanes are seeded with +Inf.
define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp sgt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp sgt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp sgt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp sgt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ugt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ugt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ugt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ugt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT:
vmovss %xmm0, (%esp) 8380; X86-NEXT: flds (%esp) 8381; X86-NEXT: popl %eax 8382; X86-NEXT: .cfi_def_cfa_offset 4 8383; X86-NEXT: vzeroupper 8384; X86-NEXT: retl 8385; 8386; X64-LABEL: test_mm512_reduce_max_ps: 8387; X64: # %bb.0: # %entry 8388; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 8389; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0 8390; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 8391; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 8392; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8393; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 8394; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 8395; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0 8396; X64-NEXT: vzeroupper 8397; X64-NEXT: retq 8398entry: 8399 %0 = bitcast <16 x float> %__W to <8 x double> 8400 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8401 %1 = bitcast <4 x double> %extract.i to <8 x float> 8402 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8403 %2 = bitcast <4 x double> %extract2.i to <8 x float> 8404 %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) 8405 %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8406 %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8407 %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i) 8408 %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8409 %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i) 8410 %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8411 %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i) 8412 %vecext.i = extractelement <4 x float> %6, i32 0 8413 ret float %vecext.i 8414} 8415 8416define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) { 8417; CHECK-LABEL: test_mm512_reduce_min_epi32: 8418; CHECK: # %bb.0: # %entry 8419; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 8420; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0 8421; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 8422; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8423; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8424; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8425; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8426; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8427; CHECK-NEXT: vmovd %xmm0, %eax 8428; CHECK-NEXT: vzeroupper 8429; CHECK-NEXT: ret{{[l|q]}} 8430entry: 8431 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8432 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8433 %0 = bitcast <4 x i64> %extract.i to <8 x i32> 8434 %1 = bitcast <4 x i64> %extract2.i to <8 x i32> 8435 %2 = icmp slt <8 x i32> %0, %1 8436 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1 8437 %4 = bitcast <8 x i32> %3 to <4 x i64> 8438 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 8439 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 8440 %5 = bitcast <2 x i64> %extract4.i to <4 x i32> 8441 %6 = bitcast <2 x i64> %extract5.i to <4 x i32> 8442 %7 = icmp slt <4 x i32> %5, %6 8443 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6 8444 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8445 %9 = icmp slt <4 x i32> %8, 
%shuffle.i 8446 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i 8447 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8448 %11 = icmp slt <4 x i32> %10, %shuffle8.i 8449 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i 8450 %vecext.i = extractelement <4 x i32> %12, i32 0 8451 ret i32 %vecext.i 8452} 8453 8454define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) { 8455; CHECK-LABEL: test_mm512_reduce_min_epu32: 8456; CHECK: # %bb.0: # %entry 8457; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 8458; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0 8459; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 8460; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0 8461; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8462; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0 8463; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8464; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0 8465; CHECK-NEXT: vmovd %xmm0, %eax 8466; CHECK-NEXT: vzeroupper 8467; CHECK-NEXT: ret{{[l|q]}} 8468entry: 8469 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8470 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8471 %0 = bitcast <4 x i64> %extract.i to <8 x i32> 8472 %1 = bitcast <4 x i64> %extract2.i to <8 x i32> 8473 %2 = icmp ult <8 x i32> %0, %1 8474 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1 8475 %4 = bitcast <8 x i32> %3 to <4 x i64> 8476 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 8477 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 8478 %5 = bitcast <2 x i64> %extract4.i to <4 x i32> 8479 %6 = bitcast <2 x i64> %extract5.i to <4 x i32> 8480 %7 = icmp ult <4 x i32> %5, %6 8481 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6 8482 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8483 %9 = icmp ult <4 x i32> %8, %shuffle.i 8484 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i 8485 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8486 %11 = icmp ult <4 x i32> %10, %shuffle8.i 8487 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i 8488 %vecext.i = extractelement <4 x i32> %12, i32 0 8489 ret i32 %vecext.i 8490} 8491 8492define float @test_mm512_reduce_min_ps(<16 x float> %__W) { 8493; X86-LABEL: test_mm512_reduce_min_ps: 8494; X86: # %bb.0: # %entry 8495; X86-NEXT: pushl %eax 8496; X86-NEXT: .cfi_def_cfa_offset 8 8497; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 8498; X86-NEXT: vminps %ymm1, %ymm0, %ymm0 8499; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 8500; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 8501; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8502; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 8503; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 8504; X86-NEXT: vminss %xmm1, %xmm0, %xmm0 8505; X86-NEXT: vmovss %xmm0, (%esp) 8506; X86-NEXT: flds (%esp) 8507; X86-NEXT: popl %eax 8508; X86-NEXT: .cfi_def_cfa_offset 4 8509; X86-NEXT: vzeroupper 8510; X86-NEXT: retl 8511; 8512; X64-LABEL: test_mm512_reduce_min_ps: 8513; X64: # %bb.0: # %entry 8514; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 8515; X64-NEXT: vminps %ymm1, %ymm0, %ymm0 8516; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 8517; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 8518; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8519; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 8520; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 8521; X64-NEXT: vminss 
%xmm1, %xmm0, %xmm0 8522; X64-NEXT: vzeroupper 8523; X64-NEXT: retq 8524entry: 8525 %0 = bitcast <16 x float> %__W to <8 x double> 8526 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8527 %1 = bitcast <4 x double> %extract.i to <8 x float> 8528 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8529 %2 = bitcast <4 x double> %extract2.i to <8 x float> 8530 %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) 8531 %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8532 %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8533 %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i) 8534 %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8535 %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i) 8536 %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8537 %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i) 8538 %vecext.i = extractelement <4 x float> %6, i32 0 8539 ret float %vecext.i 8540} 8541 8542define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) { 8543; X86-LABEL: test_mm512_mask_reduce_max_epi32: 8544; X86: # %bb.0: # %entry 8545; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 8546; X86-NEXT: kmovw %eax, %k1 8547; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] 8548; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8549; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8550; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0 8551; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 8552; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 8553; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8554; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 8555; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8556; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 8557; X86-NEXT: vmovd %xmm0, %eax 8558; X86-NEXT: vzeroupper 8559; X86-NEXT: retl 8560; 8561; X64-LABEL: test_mm512_mask_reduce_max_epi32: 8562; X64: # %bb.0: # %entry 8563; X64-NEXT: kmovw %edi, %k1 8564; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] 8565; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8566; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8567; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0 8568; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 8569; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 8570; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8571; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 8572; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8573; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 8574; X64-NEXT: vmovd %xmm0, %eax 8575; X64-NEXT: vzeroupper 8576; X64-NEXT: retq 8577entry: 8578 %0 = bitcast <8 x i64> %__W to <16 x i32> 8579 %1 = bitcast i16 %__M to <16 x i1> 8580 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 
-2147483648, i32 -2147483648, i32 -2147483648> 8581 %3 = bitcast <16 x i32> %2 to <8 x i64> 8582 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8583 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8584 %4 = bitcast <4 x i64> %extract.i to <8 x i32> 8585 %5 = bitcast <4 x i64> %extract4.i to <8 x i32> 8586 %6 = icmp sgt <8 x i32> %4, %5 8587 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5 8588 %8 = bitcast <8 x i32> %7 to <4 x i64> 8589 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 8590 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 8591 %9 = bitcast <2 x i64> %extract6.i to <4 x i32> 8592 %10 = bitcast <2 x i64> %extract7.i to <4 x i32> 8593 %11 = icmp sgt <4 x i32> %9, %10 8594 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10 8595 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8596 %13 = icmp sgt <4 x i32> %12, %shuffle.i 8597 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i 8598 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8599 %15 = icmp sgt <4 x i32> %14, %shuffle10.i 8600 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i 8601 %vecext.i = extractelement <4 x i32> %16, i32 0 8602 ret i32 %vecext.i 8603} 8604 8605define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) { 8606; X86-LABEL: test_mm512_mask_reduce_max_epu32: 8607; X86: # %bb.0: # %entry 8608; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 8609; X86-NEXT: kmovw %eax, %k1 8610; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} 8611; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 8612; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 8613; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 8614; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 8615; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8616; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 8617; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8618; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 8619; X86-NEXT: vmovd %xmm0, %eax 8620; X86-NEXT: vzeroupper 8621; X86-NEXT: retl 8622; 8623; X64-LABEL: test_mm512_mask_reduce_max_epu32: 8624; X64: # %bb.0: # %entry 8625; X64-NEXT: kmovw %edi, %k1 8626; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} 8627; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 8628; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 8629; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 8630; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 8631; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8632; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 8633; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8634; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 8635; X64-NEXT: vmovd %xmm0, %eax 8636; X64-NEXT: vzeroupper 8637; X64-NEXT: retq 8638entry: 8639 %0 = bitcast <8 x i64> %__W to <16 x i32> 8640 %1 = bitcast i16 %__M to <16 x i1> 8641 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer 8642 %3 = bitcast <16 x i32> %2 to <8 x i64> 8643 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8644 %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8645 %4 = bitcast <4 x i64> %extract.i to <8 x i32> 8646 %5 = bitcast <4 x i64> %extract3.i to <8 x i32> 8647 %6 = icmp ugt <8 x i32> %4, %5 8648 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5 8649 %8 = bitcast <8 x i32> %7 to <4 x i64> 8650 %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, 
<2 x i32> <i32 0, i32 1> 8651 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 8652 %9 = bitcast <2 x i64> %extract5.i to <4 x i32> 8653 %10 = bitcast <2 x i64> %extract6.i to <4 x i32> 8654 %11 = icmp ugt <4 x i32> %9, %10 8655 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10 8656 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8657 %13 = icmp ugt <4 x i32> %12, %shuffle.i 8658 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i 8659 %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8660 %15 = icmp ugt <4 x i32> %14, %shuffle9.i 8661 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i 8662 %vecext.i = extractelement <4 x i32> %16, i32 0 8663 ret i32 %vecext.i 8664} 8665 8666define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) { 8667; X86-LABEL: test_mm512_mask_reduce_max_ps: 8668; X86: # %bb.0: # %entry 8669; X86-NEXT: pushl %eax 8670; X86-NEXT: .cfi_def_cfa_offset 8 8671; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 8672; X86-NEXT: kmovw %eax, %k1 8673; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf] 8674; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} 8675; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 8676; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0 8677; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 8678; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 8679; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8680; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 8681; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 8682; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 8683; X86-NEXT: vmovss %xmm0, (%esp) 8684; X86-NEXT: flds (%esp) 8685; X86-NEXT: popl %eax 8686; X86-NEXT: .cfi_def_cfa_offset 4 8687; X86-NEXT: vzeroupper 8688; X86-NEXT: retl 8689; 8690; X64-LABEL: test_mm512_mask_reduce_max_ps: 8691; X64: # %bb.0: # %entry 8692; X64-NEXT: kmovw %edi, %k1 8693; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf] 8694; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1} 8695; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0 8696; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0 8697; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 8698; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 8699; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8700; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 8701; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 8702; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0 8703; X64-NEXT: vzeroupper 8704; X64-NEXT: retq 8705entry: 8706 %0 = bitcast i16 %__M to <16 x i1> 8707 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000> 8708 %2 = bitcast <16 x float> %1 to <8 x double> 8709 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8710 %3 = bitcast <4 x double> %extract.i to <8 x float> 8711 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8712 %4 = bitcast <4 x double> %extract4.i to <8 x float> 8713 %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x 
float> %3, <8 x float> %4) 8714 %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8715 %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8716 %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i) 8717 %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8718 %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i) 8719 %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8720 %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i) 8721 %vecext.i = extractelement <4 x float> %8, i32 0 8722 ret float %vecext.i 8723} 8724 8725define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) { 8726; X86-LABEL: test_mm512_mask_reduce_min_epi32: 8727; X86: # %bb.0: # %entry 8728; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 8729; X86-NEXT: kmovw %eax, %k1 8730; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] 8731; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8732; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8733; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0 8734; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 8735; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8736; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8737; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8738; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8739; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8740; X86-NEXT: vmovd %xmm0, %eax 8741; X86-NEXT: vzeroupper 8742; X86-NEXT: retl 8743; 8744; X64-LABEL: test_mm512_mask_reduce_min_epi32: 8745; X64: # %bb.0: # %entry 8746; X64-NEXT: kmovw %edi, %k1 8747; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] 8748; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8749; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8750; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0 8751; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 8752; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8753; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8754; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8755; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8756; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0 8757; X64-NEXT: vmovd %xmm0, %eax 8758; X64-NEXT: vzeroupper 8759; X64-NEXT: retq 8760entry: 8761 %0 = bitcast <8 x i64> %__W to <16 x i32> 8762 %1 = bitcast i16 %__M to <16 x i1> 8763 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> 8764 %3 = bitcast <16 x i32> %2 to <8 x i64> 8765 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8766 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8767 %4 = bitcast <4 x i64> %extract.i to <8 x i32> 8768 %5 = bitcast <4 x i64> %extract4.i to <8 x i32> 8769 %6 = icmp slt <8 x i32> %4, %5 8770 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5 8771 %8 = bitcast <8 x i32> %7 to <4 x i64> 
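  ; Second halving step of the masked min_epi32 reduction: the 256-bit
  ; intermediate is reinterpreted as two 128-bit halves, and the two
  ; in-register shuffles that follow finish the 16-lane reduction.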
8772 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 8773 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 8774 %9 = bitcast <2 x i64> %extract6.i to <4 x i32> 8775 %10 = bitcast <2 x i64> %extract7.i to <4 x i32> 8776 %11 = icmp slt <4 x i32> %9, %10 8777 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10 8778 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8779 %13 = icmp slt <4 x i32> %12, %shuffle.i 8780 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i 8781 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8782 %15 = icmp slt <4 x i32> %14, %shuffle10.i 8783 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i 8784 %vecext.i = extractelement <4 x i32> %16, i32 0 8785 ret i32 %vecext.i 8786} 8787 8788define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) { 8789; X86-LABEL: test_mm512_mask_reduce_min_epu32: 8790; X86: # %bb.0: # %entry 8791; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 8792; X86-NEXT: kmovw %eax, %k1 8793; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 8794; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8795; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8796; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0 8797; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 8798; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 8799; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8800; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 8801; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8802; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 8803; X86-NEXT: vmovd %xmm0, %eax 8804; X86-NEXT: vzeroupper 8805; X86-NEXT: retl 8806; 8807; X64-LABEL: test_mm512_mask_reduce_min_epu32: 8808; X64: # %bb.0: # %entry 8809; X64-NEXT: kmovw %edi, %k1 8810; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 8811; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 8812; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 8813; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0 8814; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 8815; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0 8816; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 8817; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0 8818; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] 8819; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0 8820; X64-NEXT: vmovd %xmm0, %eax 8821; X64-NEXT: vzeroupper 8822; X64-NEXT: retq 8823entry: 8824 %0 = bitcast <8 x i64> %__W to <16 x i32> 8825 %1 = bitcast i16 %__M to <16 x i1> 8826 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 8827 %3 = bitcast <16 x i32> %2 to <8 x i64> 8828 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8829 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8830 %4 = bitcast <4 x i64> %extract.i to <8 x i32> 8831 %5 = bitcast <4 x i64> %extract4.i to <8 x i32> 8832 %6 = icmp ult <8 x i32> %4, %5 8833 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5 8834 %8 = bitcast <8 x i32> %7 to <4 x i64> 8835 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 8836 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 8837 %9 = bitcast <2 x i64> %extract6.i to <4 x i32> 8838 %10 = bitcast <2 x i64> %extract7.i to <4 x i32> 8839 %11 = icmp ult <4 x i32> %9, %10 8840 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10 8841 
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8842 %13 = icmp ult <4 x i32> %12, %shuffle.i 8843 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i 8844 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8845 %15 = icmp ult <4 x i32> %14, %shuffle10.i 8846 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i 8847 %vecext.i = extractelement <4 x i32> %16, i32 0 8848 ret i32 %vecext.i 8849} 8850 8851define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) { 8852; X86-LABEL: test_mm512_mask_reduce_min_ps: 8853; X86: # %bb.0: # %entry 8854; X86-NEXT: pushl %eax 8855; X86-NEXT: .cfi_def_cfa_offset 8 8856; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 8857; X86-NEXT: kmovw %eax, %k1 8858; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf] 8859; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} 8860; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 8861; X86-NEXT: vminps %ymm0, %ymm1, %ymm0 8862; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 8863; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 8864; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8865; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 8866; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 8867; X86-NEXT: vminss %xmm1, %xmm0, %xmm0 8868; X86-NEXT: vmovss %xmm0, (%esp) 8869; X86-NEXT: flds (%esp) 8870; X86-NEXT: popl %eax 8871; X86-NEXT: .cfi_def_cfa_offset 4 8872; X86-NEXT: vzeroupper 8873; X86-NEXT: retl 8874; 8875; X64-LABEL: test_mm512_mask_reduce_min_ps: 8876; X64: # %bb.0: # %entry 8877; X64-NEXT: kmovw %edi, %k1 8878; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf] 8879; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1} 8880; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0 8881; X64-NEXT: vminps %ymm0, %ymm1, %ymm0 8882; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 8883; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 8884; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8885; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 8886; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 8887; X64-NEXT: vminss %xmm1, %xmm0, %xmm0 8888; X64-NEXT: vzeroupper 8889; X64-NEXT: retq 8890entry: 8891 %0 = bitcast i16 %__M to <16 x i1> 8892 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000> 8893 %2 = bitcast <16 x float> %1 to <8 x double> 8894 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8895 %3 = bitcast <4 x double> %extract.i to <8 x float> 8896 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8897 %4 = bitcast <4 x double> %extract4.i to <8 x float> 8898 %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4) 8899 %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8900 %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8901 %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x 
float> %extract7.i) 8902 %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8903 %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i) 8904 %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8905 %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i) 8906 %vecext.i = extractelement <4 x float> %8, i32 0 8907 ret float %vecext.i 8908} 8909 8910define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8911; X86-LABEL: test_mm512_mask_max_pd: 8912; X86: # %bb.0: # %entry 8913; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8914; X86-NEXT: kmovw %eax, %k1 8915; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8916; X86-NEXT: retl 8917; 8918; X64-LABEL: test_mm512_mask_max_pd: 8919; X64: # %bb.0: # %entry 8920; X64-NEXT: kmovw %edi, %k1 8921; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8922; X64-NEXT: retq 8923entry: 8924 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8925 %1 = bitcast i8 %__U to <8 x i1> 8926 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 8927 ret <8 x double> %2 8928} 8929 8930define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8931; X86-LABEL: test_mm512_maskz_max_pd: 8932; X86: # %bb.0: # %entry 8933; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8934; X86-NEXT: kmovw %eax, %k1 8935; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8936; X86-NEXT: retl 8937; 8938; X64-LABEL: test_mm512_maskz_max_pd: 8939; X64: # %bb.0: # %entry 8940; X64-NEXT: kmovw %edi, %k1 8941; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8942; X64-NEXT: retq 8943entry: 8944 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8945 %1 = bitcast i8 %__U to <8 x i1> 8946 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 8947 ret <8 x double> %2 8948} 8949 8950define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 8951; X86-LABEL: test_mm512_mask_max_ps: 8952; X86: # %bb.0: # %entry 8953; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 8954; X86-NEXT: kmovw %eax, %k1 8955; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} 8956; X86-NEXT: retl 8957; 8958; X64-LABEL: test_mm512_mask_max_ps: 8959; X64: # %bb.0: # %entry 8960; X64-NEXT: kmovw %edi, %k1 8961; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} 8962; X64-NEXT: retq 8963entry: 8964 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 8965 %1 = bitcast i16 %__U to <16 x i1> 8966 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 8967 ret <16 x float> %2 8968} 8969 8970define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8971; X86-LABEL: test_mm512_mask_max_round_pd: 8972; X86: # %bb.0: # %entry 8973; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8974; X86-NEXT: kmovw %eax, %k1 8975; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8976; X86-NEXT: retl 8977; 8978; X64-LABEL: test_mm512_mask_max_round_pd: 8979; X64: # %bb.0: # %entry 8980; X64-NEXT: kmovw %edi, %k1 8981; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8982; X64-NEXT: retq 8983entry: 8984 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8985 %1 = bitcast i8 %__U to <8 x i1> 8986 %2 = select <8 x 

define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}
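
; The *_round_* variants pass i32 4 (_MM_FROUND_CUR_DIRECTION), so no
; embedded-rounding encoding is needed and they lower to the same
; instructions as the non-round forms above.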

define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_max_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_max_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  ret <16 x float> %0
}
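
; The min tests mirror the max tests above, using llvm.x86.avx512.min.*.512
; and lowering to vminpd/vminps.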

define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_min_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_min_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  ret <16 x float> %0
}
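
; sqrt: the plain and masked forms use the generic llvm.sqrt.* intrinsic,
; while the *_round_* forms use llvm.x86.avx512.sqrt.*.512 with i32 8
; (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), which selects the
; embedded-rounding {rn-sae} encoding.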

define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
; CHECK-LABEL: test_mm512_sqrt_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)

define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
; CHECK-LABEL: test_mm512_sqrt_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)

define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  ret <16 x float> %0
}
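
; Rotates: _mm512_rol_* is expressed as a funnel shift with both data
; operands equal (llvm.fshl with x, x is a rotate left), which llc matches
; to vprold/vprolq and, for variable counts, vprolvd/vprolvq.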

define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_rol_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprold $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_rol_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolq $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
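
; Rotate right is the same pattern with llvm.fshr, matched to
; vprord/vprorq and vprorvd/vprorvq.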

define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
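
; Declarations for the intrinsics used by the tests in this file.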

declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)

declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)

!0 = !{i32 1}