; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c

; Layout of every test below: the i8 mask %__U is moved into %k1 (loaded from
; the stack first on the 32-bit target), then a single conversion instruction
; is expected. "mask" variants merge into the pass-through operand %__W,
; "maskz" variants zero the unselected lanes ({z}).

; Signed i32 -> float (sitofp) followed by a select on the mask bits; the
; select is expected to fold into a masked vcvtdq2ps (xmm for 128-bit, ymm for
; 256-bit).
define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

; double -> signed i32. The 128-bit forms call the masked
; llvm.x86.avx512.mask.cvtpd2dq.128 intrinsic directly; the 256-bit forms call
; the unmasked llvm.x86.avx.cvt.pd2dq.256 and apply the mask with a select.
; Both are expected to produce a masked vcvtpd2dq (plus vzeroupper when a ymm
; source is used).
define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

; double -> float. 128-bit forms use llvm.x86.avx512.mask.cvtpd2ps; 256-bit
; forms use llvm.x86.avx.cvt.pd2.ps.256 plus a select. Expected instruction is
; a masked vcvtpd2ps.
define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

; double -> unsigned i32 via llvm.x86.avx512.mask.cvtpd2udq.{128,256}. The
; unmasked variants pass an all-ones mask (i8 -1) and are expected to emit an
; unmasked vcvtpd2udq; their output is identical on both targets apart from
; the return mnemonic, so they use the shared prefix.
define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

; float -> signed i32 using the unmasked llvm.x86.sse2.cvtps2dq /
; llvm.x86.avx.cvt.ps2dq.256 intrinsics followed by a select on the mask bits;
; expected to fold into a masked vcvtps2dq.
define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

; float -> double written as plain fpext (the 128-bit forms first take the low
; two floats with a shufflevector) followed by a select on the mask bits;
; expected to fold into a masked vcvtps2pd.
define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

; float -> unsigned i32 via llvm.x86.avx512.mask.cvtps2udq.{128,256}; the
; unmasked variants pass an all-ones mask (i8 -1). Expected instruction is
; vcvtps2udq.
define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

; Truncating double -> signed i32. 128-bit forms call
; llvm.x86.avx512.mask.cvttpd2dq.128; 256-bit forms call the unmasked
; llvm.x86.avx.cvtt.pd2dq.256 and select on the mask bits. Expected
; instruction is a masked vcvttpd2dq.
define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

; Truncating double -> unsigned i32 via
; llvm.x86.avx512.mask.cvttpd2udq.{128,256}; the unmasked variants pass an
; all-ones mask (i8 -1). Expected instruction is vcvttpd2udq.
define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

; Truncating float -> signed i32 using the unmasked llvm.x86.sse2.cvttps2dq
; intrinsic followed by a select on the mask bits; expected to fold into a
; masked vcvttps2dq.
define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT: 
kmovw %eax, %k1 863; X86-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z} 864; X86-NEXT: retl 865; 866; X64-LABEL: test_mm_maskz_cvttps_epi32: 867; X64: # %bb.0: # %entry 868; X64-NEXT: kmovw %edi, %k1 869; X64-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z} 870; X64-NEXT: retq 871entry: 872 %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8 873 %1 = bitcast i8 %__U to <8 x i1> 874 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 875 %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer 876 %3 = bitcast <4 x i32> %2 to <2 x i64> 877 ret <2 x i64> %3 878} 879 880define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) { 881; X86-LABEL: test_mm256_mask_cvttps_epi32: 882; X86: # %bb.0: # %entry 883; X86-NEXT: movb {{[0-9]+}}(%esp), %al 884; X86-NEXT: kmovw %eax, %k1 885; X86-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1} 886; X86-NEXT: retl 887; 888; X64-LABEL: test_mm256_mask_cvttps_epi32: 889; X64: # %bb.0: # %entry 890; X64-NEXT: kmovw %edi, %k1 891; X64-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1} 892; X64-NEXT: retq 893entry: 894 %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8 895 %1 = bitcast <4 x i64> %__W to <8 x i32> 896 %2 = bitcast i8 %__U to <8 x i1> 897 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1 898 %4 = bitcast <8 x i32> %3 to <4 x i64> 899 ret <4 x i64> %4 900} 901 902define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) { 903; X86-LABEL: test_mm256_maskz_cvttps_epi32: 904; X86: # %bb.0: # %entry 905; X86-NEXT: movb {{[0-9]+}}(%esp), %al 906; X86-NEXT: kmovw %eax, %k1 907; X86-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z} 908; X86-NEXT: retl 909; 910; X64-LABEL: test_mm256_maskz_cvttps_epi32: 911; X64: # %bb.0: # %entry 912; X64-NEXT: kmovw %edi, %k1 913; X64-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z} 914; X64-NEXT: retq 915entry: 916 %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8 
917 %1 = bitcast i8 %__U to <8 x i1> 918 %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer 919 %3 = bitcast <8 x i32> %2 to <4 x i64> 920 ret <4 x i64> %3 921} 922 923define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) { 924; CHECK-LABEL: test_mm_cvttps_epu32: 925; CHECK: # %bb.0: # %entry 926; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 927; CHECK-NEXT: ret{{[l|q]}} 928entry: 929 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8 930 %1 = bitcast <4 x i32> %0 to <2 x i64> 931 ret <2 x i64> %1 932} 933 934define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) { 935; X86-LABEL: test_mm_mask_cvttps_epu32: 936; X86: # %bb.0: # %entry 937; X86-NEXT: movb {{[0-9]+}}(%esp), %al 938; X86-NEXT: kmovw %eax, %k1 939; X86-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1} 940; X86-NEXT: retl 941; 942; X64-LABEL: test_mm_mask_cvttps_epu32: 943; X64: # %bb.0: # %entry 944; X64-NEXT: kmovw %edi, %k1 945; X64-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1} 946; X64-NEXT: retq 947entry: 948 %0 = bitcast <2 x i64> %__W to <4 x i32> 949 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8 950 %2 = bitcast <4 x i32> %1 to <2 x i64> 951 ret <2 x i64> %2 952} 953 954define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) { 955; X86-LABEL: test_mm_maskz_cvttps_epu32: 956; X86: # %bb.0: # %entry 957; X86-NEXT: movb {{[0-9]+}}(%esp), %al 958; X86-NEXT: kmovw %eax, %k1 959; X86-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z} 960; X86-NEXT: retl 961; 962; X64-LABEL: test_mm_maskz_cvttps_epu32: 963; X64: # %bb.0: # %entry 964; X64-NEXT: kmovw %edi, %k1 965; X64-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z} 966; X64-NEXT: retq 967entry: 968 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 969 %1 = bitcast <4 x i32> %0 to <2 x i64> 970 ret <2 x i64> %1 971} 972 
; ---- Truncating float -> unsigned i32 conversion, 256-bit ----
; The masked intrinsic is called with an all-ones mask, a caller-supplied
; pass-through vector, or a zero pass-through; the expected assembly is the
; plain, merge-masked ({%k1}) and zero-masked ({%k1} {z}) form respectively.

define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvttps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  ; i8 -1 enables every lane, so no mask register is expected in the output.
  %cvt = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  ; %__W is reinterpreted as 8 x i32 and handed to the intrinsic as pass-through.
  %passthru = bitcast <4 x i64> %__W to <8 x i32>
  %cvt = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %passthru, i8 %__U) #8
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %cvt = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}

; ---- Unsigned i32 -> double conversion, 128-bit result ----
; Only the two low i32 elements of the source are converted (uitofp on a
; <2 x i32> shuffle). Masked variants use the two low bits of %__U to select
; between the converted value and %__W or zero.

define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtepu32_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %src.lo2 = shufflevector <4 x i32> %src.v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %src.lo2 to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %src.lo2 = shufflevector <4 x i32> %src.v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %src.lo2 to <2 x double>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.v8i1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %res = select <2 x i1> %mask.lo2, <2 x double> %cvt, <2 x double> %__W
  ret <2 x double> %res
}

define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %src.lo2 = shufflevector <4 x i32> %src.v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %src.lo2 to <2 x double>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.v8i1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %res = select <2 x i1> %mask.lo2, <2 x double> %cvt, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; ---- Unsigned i32 -> double conversion, 256-bit result ----
; All four i32 elements are converted; masked variants use the four low bits
; of %__U.

define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepu32_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %cvt = uitofp <4 x i32> %src.v4i32 to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %cvt = uitofp <4 x i32> %src.v4i32 to <4 x double>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.v8i1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %mask.lo4, <4 x double> %cvt, <4 x double> %__W
  ret <4 x double> %res
}

define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %cvt = uitofp <4 x i32> %src.v4i32 to <4 x double>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.v8i1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %mask.lo4, <4 x double> %cvt, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; ---- Unsigned i32 -> float conversion, 128-bit ----
; Written as generic uitofp plus select; masked variants use the four low
; bits of %__U.

define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepu32_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %cvt = uitofp <4 x i32> %src.v4i32 to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %cvt = uitofp <4 x i32> %src.v4i32 to <4 x float>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.v8i1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %mask.lo4, <4 x float> %cvt, <4 x float> %__W
  ret <4 x float> %res
}

define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %src.v4i32 = bitcast <2 x i64> %__A to <4 x i32>
  %cvt = uitofp <4 x i32> %src.v4i32 to <4 x float>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.v8i1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %mask.lo4, <4 x float> %cvt, <4 x float> zeroinitializer
  ret <4 x float> %res
}

; ---- Unsigned i32 -> float conversion, 256-bit ----
; Eight lanes, so the whole i8 mask is used directly as <8 x i1>.

define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepu32_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %src.v8i32 = bitcast <4 x i64> %__A to <8 x i32>
  %cvt = uitofp <8 x i32> %src.v8i32 to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %src.v8i32 = bitcast <4 x i64> %__A to <8 x i32>
  %cvt = uitofp <8 x i32> %src.v8i32 to <8 x float>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %res = select <8 x i1> %mask.v8i1, <8 x float> %cvt, <8 x float> %__W
  ret <8 x float> %res
}

define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %src.v8i32 = bitcast <4 x i64> %__A to <8 x i32>
  %cvt = uitofp <8 x i32> %src.v8i32 to <8 x float>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %res = select <8 x i1> %mask.v8i1, <8 x float> %cvt, <8 x float> zeroinitializer
  ret <8 x float> %res
}

; ---- 128-bit lane shuffle of two 256-bit float vectors ----
; The shuffle mask picks the upper half of %__A followed by the upper half of
; %__B. The unmasked form is expected to select vperm2f128; the masked forms
; are expected to select vshuff32x4 under {%k1}.

define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %hi.halves = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %hi.halves
}

define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT: retq
entry:
  %hi.halves = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %res = select <8 x i1> %mask.v8i1, <8 x float> %hi.halves, <8 x float> %__W
  ret <8 x float> %res
}

define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT: retq
entry:
  %hi.halves = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %res = select <8 x i1> %mask.v8i1, <8 x float> %hi.halves, <8 x float> zeroinitializer
  ret <8 x float> %res
}

; ---- 128-bit lane shuffle of two 256-bit double vectors ----
; Same upper-half selection as above with 64-bit elements; the masked form
; uses the four low bits of %__U and is expected to select vshuff64x2.

define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %hi.halves = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %hi.halves
}

define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT: retq
entry:
  %hi.halves = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %mask.v8i1 = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.v8i1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %mask.lo4, <4 x double> %hi.halves, <4 x double> %__W
  ret <4 x double> %res
}
1324 1325define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { 1326; X86-LABEL: test_mm256_maskz_shuffle_f64x2: 1327; X86: # %bb.0: # %entry 1328; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1329; X86-NEXT: kmovw %eax, %k1 1330; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] 1331; X86-NEXT: retl 1332; 1333; X64-LABEL: test_mm256_maskz_shuffle_f64x2: 1334; X64: # %bb.0: # %entry 1335; X64-NEXT: kmovw %edi, %k1 1336; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] 1337; X64-NEXT: retq 1338entry: 1339 %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1340 %0 = bitcast i8 %__U to <8 x i1> 1341 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1342 %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer 1343 ret <4 x double> %1 1344} 1345 1346define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) { 1347; CHECK-LABEL: test_mm256_shuffle_i32x4: 1348; CHECK: # %bb.0: # %entry 1349; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 1350; CHECK-NEXT: ret{{[l|q]}} 1351entry: 1352 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1353 ret <4 x i64> %shuffle 1354} 1355 1356define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1357; X86-LABEL: test_mm256_mask_shuffle_i32x4: 1358; X86: # %bb.0: # %entry 1359; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1360; X86-NEXT: kmovw %eax, %k1 1361; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7] 1362; X86-NEXT: retl 1363; 1364; X64-LABEL: test_mm256_mask_shuffle_i32x4: 1365; X64: # %bb.0: # %entry 1366; X64-NEXT: kmovw %edi, %k1 1367; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7] 1368; X64-NEXT: retq 1369entry: 1370 %shuffle = shufflevector <4 x i64> %__A, <4 
x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1371 %0 = bitcast <4 x i64> %shuffle to <8 x i32> 1372 %1 = bitcast <4 x i64> %__W to <8 x i32> 1373 %2 = bitcast i8 %__U to <8 x i1> 1374 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1 1375 %4 = bitcast <8 x i32> %3 to <4 x i64> 1376 ret <4 x i64> %4 1377} 1378 1379define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1380; X86-LABEL: test_mm256_maskz_shuffle_i32x4: 1381; X86: # %bb.0: # %entry 1382; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1383; X86-NEXT: kmovw %eax, %k1 1384; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] 1385; X86-NEXT: retl 1386; 1387; X64-LABEL: test_mm256_maskz_shuffle_i32x4: 1388; X64: # %bb.0: # %entry 1389; X64-NEXT: kmovw %edi, %k1 1390; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] 1391; X64-NEXT: retq 1392entry: 1393 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1394 %0 = bitcast <4 x i64> %shuffle to <8 x i32> 1395 %1 = bitcast i8 %__U to <8 x i1> 1396 %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer 1397 %3 = bitcast <8 x i32> %2 to <4 x i64> 1398 ret <4 x i64> %3 1399} 1400 1401define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) { 1402; CHECK-LABEL: test_mm256_shuffle_i64x2: 1403; CHECK: # %bb.0: # %entry 1404; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 1405; CHECK-NEXT: ret{{[l|q]}} 1406entry: 1407 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1408 ret <4 x i64> %shuffle 1409} 1410 1411define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1412; X86-LABEL: test_mm256_mask_shuffle_i64x2: 1413; X86: # %bb.0: # %entry 1414; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1415; X86-NEXT: kmovw %eax, %k1 1416; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3] 1417; 
X86-NEXT: retl 1418; 1419; X64-LABEL: test_mm256_mask_shuffle_i64x2: 1420; X64: # %bb.0: # %entry 1421; X64-NEXT: kmovw %edi, %k1 1422; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3] 1423; X64-NEXT: retq 1424entry: 1425 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1426 %0 = bitcast i8 %__U to <8 x i1> 1427 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1428 %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W 1429 ret <4 x i64> %1 1430} 1431 1432define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1433; X86-LABEL: test_mm256_maskz_shuffle_i64x2: 1434; X86: # %bb.0: # %entry 1435; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1436; X86-NEXT: kmovw %eax, %k1 1437; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] 1438; X86-NEXT: retl 1439; 1440; X64-LABEL: test_mm256_maskz_shuffle_i64x2: 1441; X64: # %bb.0: # %entry 1442; X64-NEXT: kmovw %edi, %k1 1443; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] 1444; X64-NEXT: retq 1445entry: 1446 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1447 %0 = bitcast i8 %__U to <8 x i1> 1448 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1449 %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer 1450 ret <4 x i64> %1 1451} 1452 1453define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) { 1454; CHECK-LABEL: test_mm_test_epi32_mask: 1455; CHECK: # %bb.0: # %entry 1456; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k0 1457; CHECK-NEXT: kmovw %k0, %eax 1458; CHECK-NEXT: movzbl %al, %eax 1459; CHECK-NEXT: ret{{[l|q]}} 1460entry: 1461 %and.i.i = and <2 x i64> %__B, %__A 1462 %0 = bitcast <2 x i64> %and.i.i to <4 x i32> 1463 %1 = icmp ne <4 x i32> %0, zeroinitializer 1464 %2 = shufflevector <4 x i1> %1, <4 x i1> 
zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1465 %3 = bitcast <8 x i1> %2 to i8 1466 ret i8 %3 1467} 1468 1469define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 1470; X86-LABEL: test_mm_mask_test_epi32_mask: 1471; X86: # %bb.0: # %entry 1472; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1473; X86-NEXT: kmovw %eax, %k1 1474; X86-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} 1475; X86-NEXT: kmovw %k0, %eax 1476; X86-NEXT: movzbl %al, %eax 1477; X86-NEXT: retl 1478; 1479; X64-LABEL: test_mm_mask_test_epi32_mask: 1480; X64: # %bb.0: # %entry 1481; X64-NEXT: kmovw %edi, %k1 1482; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} 1483; X64-NEXT: kmovw %k0, %eax 1484; X64-NEXT: movzbl %al, %eax 1485; X64-NEXT: retq 1486entry: 1487 %and.i.i = and <2 x i64> %__B, %__A 1488 %0 = bitcast <2 x i64> %and.i.i to <4 x i32> 1489 %1 = icmp ne <4 x i32> %0, zeroinitializer 1490 %2 = bitcast i8 %__U to <8 x i1> 1491 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1492 %3 = and <4 x i1> %1, %extract.i 1493 %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1494 %5 = bitcast <8 x i1> %4 to i8 1495 ret i8 %5 1496} 1497 1498define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) { 1499; CHECK-LABEL: test_mm256_test_epi32_mask: 1500; CHECK: # %bb.0: # %entry 1501; CHECK-NEXT: vptestmd %ymm0, %ymm1, %k0 1502; CHECK-NEXT: kmovw %k0, %eax 1503; CHECK-NEXT: movzbl %al, %eax 1504; CHECK-NEXT: vzeroupper 1505; CHECK-NEXT: ret{{[l|q]}} 1506entry: 1507 %and.i.i = and <4 x i64> %__B, %__A 1508 %0 = bitcast <4 x i64> %and.i.i to <8 x i32> 1509 %1 = icmp ne <8 x i32> %0, zeroinitializer 1510 %2 = bitcast <8 x i1> %1 to i8 1511 ret i8 %2 1512} 1513 1514define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1515; X86-LABEL: 
test_mm256_mask_test_epi32_mask: 1516; X86: # %bb.0: # %entry 1517; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1518; X86-NEXT: kmovw %eax, %k1 1519; X86-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} 1520; X86-NEXT: kmovw %k0, %eax 1521; X86-NEXT: movzbl %al, %eax 1522; X86-NEXT: vzeroupper 1523; X86-NEXT: retl 1524; 1525; X64-LABEL: test_mm256_mask_test_epi32_mask: 1526; X64: # %bb.0: # %entry 1527; X64-NEXT: kmovw %edi, %k1 1528; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} 1529; X64-NEXT: kmovw %k0, %eax 1530; X64-NEXT: movzbl %al, %eax 1531; X64-NEXT: vzeroupper 1532; X64-NEXT: retq 1533entry: 1534 %and.i.i = and <4 x i64> %__B, %__A 1535 %0 = bitcast <4 x i64> %and.i.i to <8 x i32> 1536 %1 = icmp ne <8 x i32> %0, zeroinitializer 1537 %2 = bitcast i8 %__U to <8 x i1> 1538 %3 = and <8 x i1> %1, %2 1539 %4 = bitcast <8 x i1> %3 to i8 1540 ret i8 %4 1541} 1542 1543define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) { 1544; CHECK-LABEL: test_mm_test_epi64_mask: 1545; CHECK: # %bb.0: # %entry 1546; CHECK-NEXT: vptestmq %xmm0, %xmm1, %k0 1547; CHECK-NEXT: kmovw %k0, %eax 1548; CHECK-NEXT: movzbl %al, %eax 1549; CHECK-NEXT: ret{{[l|q]}} 1550entry: 1551 %and.i.i = and <2 x i64> %__B, %__A 1552 %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer 1553 %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1554 %2 = bitcast <8 x i1> %1 to i8 1555 ret i8 %2 1556} 1557 1558define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 1559; X86-LABEL: test_mm_mask_test_epi64_mask: 1560; X86: # %bb.0: # %entry 1561; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1562; X86-NEXT: kmovw %eax, %k1 1563; X86-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1} 1564; X86-NEXT: kmovw %k0, %eax 1565; X86-NEXT: movzbl %al, %eax 1566; X86-NEXT: retl 1567; 1568; X64-LABEL: test_mm_mask_test_epi64_mask: 1569; X64: # %bb.0: # %entry 1570; X64-NEXT: kmovw %edi, %k1 1571; X64-NEXT: vptestmq %xmm0, %xmm1, 
%k0 {%k1} 1572; X64-NEXT: kmovw %k0, %eax 1573; X64-NEXT: movzbl %al, %eax 1574; X64-NEXT: retq 1575entry: 1576 %and.i.i = and <2 x i64> %__B, %__A 1577 %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer 1578 %1 = bitcast i8 %__U to <8 x i1> 1579 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 1580 %2 = and <2 x i1> %0, %extract.i 1581 %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1582 %4 = bitcast <8 x i1> %3 to i8 1583 ret i8 %4 1584} 1585 1586define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) { 1587; CHECK-LABEL: test_mm256_test_epi64_mask: 1588; CHECK: # %bb.0: # %entry 1589; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k0 1590; CHECK-NEXT: kmovw %k0, %eax 1591; CHECK-NEXT: movzbl %al, %eax 1592; CHECK-NEXT: vzeroupper 1593; CHECK-NEXT: ret{{[l|q]}} 1594entry: 1595 %and.i.i = and <4 x i64> %__B, %__A 1596 %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer 1597 %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1598 %2 = bitcast <8 x i1> %1 to i8 1599 ret i8 %2 1600} 1601 1602define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1603; X86-LABEL: test_mm256_mask_test_epi64_mask: 1604; X86: # %bb.0: # %entry 1605; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1606; X86-NEXT: kmovw %eax, %k1 1607; X86-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1} 1608; X86-NEXT: kmovw %k0, %eax 1609; X86-NEXT: movzbl %al, %eax 1610; X86-NEXT: vzeroupper 1611; X86-NEXT: retl 1612; 1613; X64-LABEL: test_mm256_mask_test_epi64_mask: 1614; X64: # %bb.0: # %entry 1615; X64-NEXT: kmovw %edi, %k1 1616; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1} 1617; X64-NEXT: kmovw %k0, %eax 1618; X64-NEXT: movzbl %al, %eax 1619; X64-NEXT: vzeroupper 1620; X64-NEXT: retq 1621entry: 1622 %and.i.i = and <4 x i64> %__B, %__A 1623 %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer 
1624 %1 = bitcast i8 %__U to <8 x i1> 1625 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1626 %2 = and <4 x i1> %0, %extract.i 1627 %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1628 %4 = bitcast <8 x i1> %3 to i8 1629 ret i8 %4 1630} 1631 1632define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) { 1633; CHECK-LABEL: test_mm_testn_epi32_mask: 1634; CHECK: # %bb.0: # %entry 1635; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k0 1636; CHECK-NEXT: kmovw %k0, %eax 1637; CHECK-NEXT: movzbl %al, %eax 1638; CHECK-NEXT: ret{{[l|q]}} 1639entry: 1640 %and.i.i = and <2 x i64> %__B, %__A 1641 %0 = bitcast <2 x i64> %and.i.i to <4 x i32> 1642 %1 = icmp eq <4 x i32> %0, zeroinitializer 1643 %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1644 %3 = bitcast <8 x i1> %2 to i8 1645 ret i8 %3 1646} 1647 1648define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 1649; X86-LABEL: test_mm_mask_testn_epi32_mask: 1650; X86: # %bb.0: # %entry 1651; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1652; X86-NEXT: kmovw %eax, %k1 1653; X86-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} 1654; X86-NEXT: kmovw %k0, %eax 1655; X86-NEXT: movzbl %al, %eax 1656; X86-NEXT: retl 1657; 1658; X64-LABEL: test_mm_mask_testn_epi32_mask: 1659; X64: # %bb.0: # %entry 1660; X64-NEXT: kmovw %edi, %k1 1661; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} 1662; X64-NEXT: kmovw %k0, %eax 1663; X64-NEXT: movzbl %al, %eax 1664; X64-NEXT: retq 1665entry: 1666 %and.i.i = and <2 x i64> %__B, %__A 1667 %0 = bitcast <2 x i64> %and.i.i to <4 x i32> 1668 %1 = icmp eq <4 x i32> %0, zeroinitializer 1669 %2 = bitcast i8 %__U to <8 x i1> 1670 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1671 %3 = and <4 x i1> %1, %extract.i 1672 %4 = 
shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1673 %5 = bitcast <8 x i1> %4 to i8 1674 ret i8 %5 1675} 1676 1677define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) { 1678; CHECK-LABEL: test_mm256_testn_epi32_mask: 1679; CHECK: # %bb.0: # %entry 1680; CHECK-NEXT: vptestnmd %ymm0, %ymm1, %k0 1681; CHECK-NEXT: kmovw %k0, %eax 1682; CHECK-NEXT: movzbl %al, %eax 1683; CHECK-NEXT: vzeroupper 1684; CHECK-NEXT: ret{{[l|q]}} 1685entry: 1686 %and.i.i = and <4 x i64> %__B, %__A 1687 %0 = bitcast <4 x i64> %and.i.i to <8 x i32> 1688 %1 = icmp eq <8 x i32> %0, zeroinitializer 1689 %2 = bitcast <8 x i1> %1 to i8 1690 ret i8 %2 1691} 1692 1693define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1694; X86-LABEL: test_mm256_mask_testn_epi32_mask: 1695; X86: # %bb.0: # %entry 1696; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1697; X86-NEXT: kmovw %eax, %k1 1698; X86-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} 1699; X86-NEXT: kmovw %k0, %eax 1700; X86-NEXT: movzbl %al, %eax 1701; X86-NEXT: vzeroupper 1702; X86-NEXT: retl 1703; 1704; X64-LABEL: test_mm256_mask_testn_epi32_mask: 1705; X64: # %bb.0: # %entry 1706; X64-NEXT: kmovw %edi, %k1 1707; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} 1708; X64-NEXT: kmovw %k0, %eax 1709; X64-NEXT: movzbl %al, %eax 1710; X64-NEXT: vzeroupper 1711; X64-NEXT: retq 1712entry: 1713 %and.i.i = and <4 x i64> %__B, %__A 1714 %0 = bitcast <4 x i64> %and.i.i to <8 x i32> 1715 %1 = icmp eq <8 x i32> %0, zeroinitializer 1716 %2 = bitcast i8 %__U to <8 x i1> 1717 %3 = and <8 x i1> %1, %2 1718 %4 = bitcast <8 x i1> %3 to i8 1719 ret i8 %4 1720} 1721 1722define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) { 1723; CHECK-LABEL: test_mm_testn_epi64_mask: 1724; CHECK: # %bb.0: # %entry 1725; CHECK-NEXT: vptestnmq %xmm0, %xmm1, %k0 1726; CHECK-NEXT: kmovw %k0, %eax 1727; CHECK-NEXT: movzbl %al, %eax 1728; 
CHECK-NEXT: ret{{[l|q]}} 1729entry: 1730 %and.i.i = and <2 x i64> %__B, %__A 1731 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer 1732 %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1733 %2 = bitcast <8 x i1> %1 to i8 1734 ret i8 %2 1735} 1736 1737define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 1738; X86-LABEL: test_mm_mask_testn_epi64_mask: 1739; X86: # %bb.0: # %entry 1740; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1741; X86-NEXT: kmovw %eax, %k1 1742; X86-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1} 1743; X86-NEXT: kmovw %k0, %eax 1744; X86-NEXT: movzbl %al, %eax 1745; X86-NEXT: retl 1746; 1747; X64-LABEL: test_mm_mask_testn_epi64_mask: 1748; X64: # %bb.0: # %entry 1749; X64-NEXT: kmovw %edi, %k1 1750; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1} 1751; X64-NEXT: kmovw %k0, %eax 1752; X64-NEXT: movzbl %al, %eax 1753; X64-NEXT: retq 1754entry: 1755 %and.i.i = and <2 x i64> %__B, %__A 1756 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer 1757 %1 = bitcast i8 %__U to <8 x i1> 1758 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 1759 %2 = and <2 x i1> %0, %extract.i 1760 %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1761 %4 = bitcast <8 x i1> %3 to i8 1762 ret i8 %4 1763} 1764 1765define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) { 1766; CHECK-LABEL: test_mm256_testn_epi64_mask: 1767; CHECK: # %bb.0: # %entry 1768; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k0 1769; CHECK-NEXT: kmovw %k0, %eax 1770; CHECK-NEXT: movzbl %al, %eax 1771; CHECK-NEXT: vzeroupper 1772; CHECK-NEXT: ret{{[l|q]}} 1773entry: 1774 %and.i.i = and <4 x i64> %__B, %__A 1775 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer 1776 %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7> 1777 %2 = bitcast <8 x i1> %1 to i8 1778 ret i8 %2 1779} 1780 1781define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1782; X86-LABEL: test_mm256_mask_testn_epi64_mask: 1783; X86: # %bb.0: # %entry 1784; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1785; X86-NEXT: kmovw %eax, %k1 1786; X86-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1} 1787; X86-NEXT: kmovw %k0, %eax 1788; X86-NEXT: movzbl %al, %eax 1789; X86-NEXT: vzeroupper 1790; X86-NEXT: retl 1791; 1792; X64-LABEL: test_mm256_mask_testn_epi64_mask: 1793; X64: # %bb.0: # %entry 1794; X64-NEXT: kmovw %edi, %k1 1795; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1} 1796; X64-NEXT: kmovw %k0, %eax 1797; X64-NEXT: movzbl %al, %eax 1798; X64-NEXT: vzeroupper 1799; X64-NEXT: retq 1800entry: 1801 %and.i.i = and <4 x i64> %__B, %__A 1802 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer 1803 %1 = bitcast i8 %__U to <8 x i1> 1804 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1805 %2 = and <4 x i1> %0, %extract.i 1806 %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1807 %4 = bitcast <8 x i1> %3 to i8 1808 ret i8 %4 1809} 1810 1811define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) { 1812; X86-LABEL: test_mm_mask_set1_epi32: 1813; X86: # %bb.0: # %entry 1814; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1815; X86-NEXT: kmovw %eax, %k1 1816; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} 1817; X86-NEXT: retl 1818; 1819; X64-LABEL: test_mm_mask_set1_epi32: 1820; X64: # %bb.0: # %entry 1821; X64-NEXT: kmovw %edi, %k1 1822; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} 1823; X64-NEXT: retq 1824entry: 1825 %0 = bitcast <2 x i64> %__O to <4 x i32> 1826 %1 = bitcast i8 %__M to <8 x i1> 1827 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1828 %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, 
i32 5>, <4 x i32> %0 1829 %3 = bitcast <4 x i32> %2 to <2 x i64> 1830 ret <2 x i64> %3 1831} 1832 1833define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) { 1834; X86-LABEL: test_mm_maskz_set1_epi32: 1835; X86: # %bb.0: # %entry 1836; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1837; X86-NEXT: kmovw %eax, %k1 1838; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z} 1839; X86-NEXT: retl 1840; 1841; X64-LABEL: test_mm_maskz_set1_epi32: 1842; X64: # %bb.0: # %entry 1843; X64-NEXT: kmovw %edi, %k1 1844; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} 1845; X64-NEXT: retq 1846entry: 1847 %0 = bitcast i8 %__M to <8 x i1> 1848 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1849 %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer 1850 %2 = bitcast <4 x i32> %1 to <2 x i64> 1851 ret <2 x i64> %2 1852} 1853 1854define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) { 1855; X86-LABEL: test_mm256_mask_set1_epi32: 1856; X86: # %bb.0: # %entry 1857; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1858; X86-NEXT: kmovw %eax, %k1 1859; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} 1860; X86-NEXT: retl 1861; 1862; X64-LABEL: test_mm256_mask_set1_epi32: 1863; X64: # %bb.0: # %entry 1864; X64-NEXT: kmovw %edi, %k1 1865; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} 1866; X64-NEXT: retq 1867entry: 1868 %0 = bitcast <4 x i64> %__O to <8 x i32> 1869 %1 = bitcast i8 %__M to <8 x i1> 1870 %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0 1871 %3 = bitcast <8 x i32> %2 to <4 x i64> 1872 ret <4 x i64> %3 1873} 1874 1875define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) { 1876; X86-LABEL: test_mm256_maskz_set1_epi32: 1877; X86: # %bb.0: # %entry 1878; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1879; X86-NEXT: kmovw %eax, %k1 1880; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z} 1881; X86-NEXT: retl 1882; 
1883; X64-LABEL: test_mm256_maskz_set1_epi32: 1884; X64: # %bb.0: # %entry 1885; X64-NEXT: kmovw %edi, %k1 1886; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} 1887; X64-NEXT: retq 1888entry: 1889 %0 = bitcast i8 %__M to <8 x i1> 1890 %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer 1891 %2 = bitcast <8 x i32> %1 to <4 x i64> 1892 ret <4 x i64> %2 1893} 1894 1895define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) { 1896; X86-LABEL: test_mm_mask_set1_epi64: 1897; X86: # %bb.0: # %entry 1898; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1899; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1900; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 1901; X86-NEXT: kmovw %eax, %k1 1902; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} 1903; X86-NEXT: retl 1904; 1905; X64-LABEL: test_mm_mask_set1_epi64: 1906; X64: # %bb.0: # %entry 1907; X64-NEXT: kmovw %edi, %k1 1908; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1} 1909; X64-NEXT: retq 1910entry: 1911 %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0 1912 %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer 1913 %0 = bitcast i8 %__M to <8 x i1> 1914 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 1915 %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O 1916 ret <2 x i64> %1 1917} 1918 1919define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) { 1920; X86-LABEL: test_mm_maskz_set1_epi64: 1921; X86: # %bb.0: # %entry 1922; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1923; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1924; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 1925; X86-NEXT: kmovw %eax, %k1 1926; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} 1927; X86-NEXT: retl 1928; 1929; X64-LABEL: test_mm_maskz_set1_epi64: 1930; X64: # %bb.0: # %entry 1931; X64-NEXT: kmovw %edi, %k1 1932; 
X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1} {z} 1933; X64-NEXT: retq 1934entry: 1935 %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0 1936 %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer 1937 %0 = bitcast i8 %__M to <8 x i1> 1938 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 1939 %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer 1940 ret <2 x i64> %1 1941} 1942 1943 1944define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) { 1945; X86-LABEL: test_mm256_mask_set1_epi64: 1946; X86: # %bb.0: # %entry 1947; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1948; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1949; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 1950; X86-NEXT: kmovw %eax, %k1 1951; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} 1952; X86-NEXT: retl 1953; 1954; X64-LABEL: test_mm256_mask_set1_epi64: 1955; X64: # %bb.0: # %entry 1956; X64-NEXT: kmovw %edi, %k1 1957; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} 1958; X64-NEXT: retq 1959entry: 1960 %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0 1961 %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer 1962 %0 = bitcast i8 %__M to <8 x i1> 1963 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1964 %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O 1965 ret <4 x i64> %1 1966} 1967 1968define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) { 1969; X86-LABEL: test_mm256_maskz_set1_epi64: 1970; X86: # %bb.0: # %entry 1971; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1972; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1973; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 1974; X86-NEXT: kmovw %eax, %k1 1975; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} 1976; X86-NEXT: retl 1977; 1978; X64-LABEL: 
test_mm256_maskz_set1_epi64: 1979; X64: # %bb.0: # %entry 1980; X64-NEXT: kmovw %edi, %k1 1981; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} {z} 1982; X64-NEXT: retq 1983entry: 1984 %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0 1985 %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer 1986 %0 = bitcast i8 %__M to <8 x i1> 1987 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1988 %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer 1989 ret <4 x i64> %1 1990} 1991 1992define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { 1993; CHECK-LABEL: test_mm_broadcastd_epi32: 1994; CHECK: # %bb.0: 1995; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 1996; CHECK-NEXT: ret{{[l|q]}} 1997 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 1998 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer 1999 %res1 = bitcast <4 x i32> %res0 to <2 x i64> 2000 ret <2 x i64> %res1 2001} 2002 2003define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) { 2004; X86-LABEL: test_mm_mask_broadcastd_epi32: 2005; X86: # %bb.0: # %entry 2006; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2007; X86-NEXT: kmovw %eax, %k1 2008; X86-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} 2009; X86-NEXT: retl 2010; 2011; X64-LABEL: test_mm_mask_broadcastd_epi32: 2012; X64: # %bb.0: # %entry 2013; X64-NEXT: kmovw %edi, %k1 2014; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} 2015; X64-NEXT: retq 2016entry: 2017 %0 = bitcast <2 x i64> %__A to <4 x i32> 2018 %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer 2019 %1 = bitcast <2 x i64> %__O to <4 x i32> 2020 %2 = bitcast i8 %__M to <8 x i1> 2021 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2022 %3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1 2023 %4 = bitcast <4 x i32> %3 to <2 x i64> 2024 ret <2 
x i64> %4 2025} 2026 2027define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) { 2028; X86-LABEL: test_mm_maskz_broadcastd_epi32: 2029; X86: # %bb.0: # %entry 2030; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2031; X86-NEXT: kmovw %eax, %k1 2032; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} 2033; X86-NEXT: retl 2034; 2035; X64-LABEL: test_mm_maskz_broadcastd_epi32: 2036; X64: # %bb.0: # %entry 2037; X64-NEXT: kmovw %edi, %k1 2038; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} 2039; X64-NEXT: retq 2040entry: 2041 %0 = bitcast <2 x i64> %__A to <4 x i32> 2042 %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer 2043 %1 = bitcast i8 %__M to <8 x i1> 2044 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2045 %2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer 2046 %3 = bitcast <4 x i32> %2 to <2 x i64> 2047 ret <2 x i64> %3 2048} 2049 2050define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) { 2051; CHECK-LABEL: test_mm256_broadcastd_epi32: 2052; CHECK: # %bb.0: 2053; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 2054; CHECK-NEXT: ret{{[l|q]}} 2055 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 2056 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer 2057 %res1 = bitcast <8 x i32> %res0 to <4 x i64> 2058 ret <4 x i64> %res1 2059} 2060 2061define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) { 2062; X86-LABEL: test_mm256_mask_broadcastd_epi32: 2063; X86: # %bb.0: 2064; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2065; X86-NEXT: kmovw %eax, %k1 2066; X86-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1} 2067; X86-NEXT: retl 2068; 2069; X64-LABEL: test_mm256_mask_broadcastd_epi32: 2070; X64: # %bb.0: 2071; X64-NEXT: kmovw %edi, %k1 2072; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1} 2073; X64-NEXT: retq 2074 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 2075 %arg1 = bitcast i8 %a1 to <8 x i1> 2076 %arg2 = 
bitcast <2 x i64> %a2 to <4 x i32> 2077 %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer 2078 %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0 2079 %res2 = bitcast <8 x i32> %res1 to <4 x i64> 2080 ret <4 x i64> %res2 2081} 2082 2083define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) { 2084; X86-LABEL: test_mm256_maskz_broadcastd_epi32: 2085; X86: # %bb.0: 2086; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2087; X86-NEXT: kmovw %eax, %k1 2088; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} 2089; X86-NEXT: retl 2090; 2091; X64-LABEL: test_mm256_maskz_broadcastd_epi32: 2092; X64: # %bb.0: 2093; X64-NEXT: kmovw %edi, %k1 2094; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} 2095; X64-NEXT: retq 2096 %arg0 = bitcast i8 %a0 to <8 x i1> 2097 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 2098 %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer 2099 %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer 2100 %res2 = bitcast <8 x i32> %res1 to <4 x i64> 2101 ret <4 x i64> %res2 2102} 2103 2104define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { 2105; CHECK-LABEL: test_mm_broadcastq_epi64: 2106; CHECK: # %bb.0: 2107; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 2108; CHECK-NEXT: ret{{[l|q]}} 2109 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer 2110 ret <2 x i64> %res 2111} 2112 2113define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) { 2114; X86-LABEL: test_mm_mask_broadcastq_epi64: 2115; X86: # %bb.0: # %entry 2116; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2117; X86-NEXT: kmovw %eax, %k1 2118; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} 2119; X86-NEXT: retl 2120; 2121; X64-LABEL: test_mm_mask_broadcastq_epi64: 2122; X64: # %bb.0: # %entry 2123; X64-NEXT: kmovw %edi, %k1 2124; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} 2125; X64-NEXT: retq 2126entry: 2127 %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x 
i64> undef, <2 x i32> zeroinitializer 2128 %0 = bitcast i8 %__M to <8 x i1> 2129 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 2130 %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O 2131 ret <2 x i64> %1 2132} 2133 2134define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) { 2135; X86-LABEL: test_mm_maskz_broadcastq_epi64: 2136; X86: # %bb.0: # %entry 2137; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2138; X86-NEXT: kmovw %eax, %k1 2139; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} 2140; X86-NEXT: retl 2141; 2142; X64-LABEL: test_mm_maskz_broadcastq_epi64: 2143; X64: # %bb.0: # %entry 2144; X64-NEXT: kmovw %edi, %k1 2145; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} 2146; X64-NEXT: retq 2147entry: 2148 %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer 2149 %0 = bitcast i8 %__M to <8 x i1> 2150 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 2151 %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer 2152 ret <2 x i64> %1 2153} 2154 2155define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) { 2156; CHECK-LABEL: test_mm256_broadcastq_epi64: 2157; CHECK: # %bb.0: 2158; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 2159; CHECK-NEXT: ret{{[l|q]}} 2160 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer 2161 ret <4 x i64> %res 2162} 2163 2164define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) { 2165; X86-LABEL: test_mm256_mask_broadcastq_epi64: 2166; X86: # %bb.0: # %entry 2167; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2168; X86-NEXT: kmovw %eax, %k1 2169; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} 2170; X86-NEXT: retl 2171; 2172; X64-LABEL: test_mm256_mask_broadcastq_epi64: 2173; X64: # %bb.0: # %entry 2174; X64-NEXT: kmovw %edi, %k1 2175; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} 2176; X64-NEXT: retq 2177entry: 2178 
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer 2179 %0 = bitcast i8 %__M to <8 x i1> 2180 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2181 %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O 2182 ret <4 x i64> %1 2183} 2184 2185define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) { 2186; X86-LABEL: test_mm256_maskz_broadcastq_epi64: 2187; X86: # %bb.0: # %entry 2188; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2189; X86-NEXT: kmovw %eax, %k1 2190; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} 2191; X86-NEXT: retl 2192; 2193; X64-LABEL: test_mm256_maskz_broadcastq_epi64: 2194; X64: # %bb.0: # %entry 2195; X64-NEXT: kmovw %edi, %k1 2196; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} 2197; X64-NEXT: retq 2198entry: 2199 %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer 2200 %0 = bitcast i8 %__M to <8 x i1> 2201 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2202 %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer 2203 ret <4 x i64> %1 2204} 2205 2206define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) { 2207; CHECK-LABEL: test_mm256_broadcastsd_pd: 2208; CHECK: # %bb.0: 2209; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 2210; CHECK-NEXT: ret{{[l|q]}} 2211 %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer 2212 ret <4 x double> %res 2213} 2214 2215define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) { 2216; X86-LABEL: test_mm256_mask_broadcastsd_pd: 2217; X86: # %bb.0: # %entry 2218; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2219; X86-NEXT: kmovw %eax, %k1 2220; X86-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} 2221; X86-NEXT: retl 2222; 2223; X64-LABEL: test_mm256_mask_broadcastsd_pd: 2224; X64: # %bb.0: # %entry 2225; X64-NEXT: kmovw %edi, 
%k1 2226; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} 2227; X64-NEXT: retq 2228entry: 2229 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer 2230 %0 = bitcast i8 %__M to <8 x i1> 2231 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2232 %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O 2233 ret <4 x double> %1 2234} 2235 2236define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) { 2237; X86-LABEL: test_mm256_maskz_broadcastsd_pd: 2238; X86: # %bb.0: # %entry 2239; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2240; X86-NEXT: kmovw %eax, %k1 2241; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} 2242; X86-NEXT: retl 2243; 2244; X64-LABEL: test_mm256_maskz_broadcastsd_pd: 2245; X64: # %bb.0: # %entry 2246; X64-NEXT: kmovw %edi, %k1 2247; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} 2248; X64-NEXT: retq 2249entry: 2250 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer 2251 %0 = bitcast i8 %__M to <8 x i1> 2252 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2253 %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer 2254 ret <4 x double> %1 2255} 2256 2257define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { 2258; CHECK-LABEL: test_mm_broadcastss_ps: 2259; CHECK: # %bb.0: 2260; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 2261; CHECK-NEXT: ret{{[l|q]}} 2262 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer 2263 ret <4 x float> %res 2264} 2265 2266define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) { 2267; X86-LABEL: test_mm_mask_broadcastss_ps: 2268; X86: # %bb.0: # %entry 2269; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2270; X86-NEXT: kmovw %eax, %k1 2271; X86-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} 2272; X86-NEXT: retl 2273; 2274; 
X64-LABEL: test_mm_mask_broadcastss_ps: 2275; X64: # %bb.0: # %entry 2276; X64-NEXT: kmovw %edi, %k1 2277; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} 2278; X64-NEXT: retq 2279entry: 2280 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer 2281 %0 = bitcast i8 %__M to <8 x i1> 2282 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2283 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O 2284 ret <4 x float> %1 2285} 2286 2287define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) { 2288; X86-LABEL: test_mm_maskz_broadcastss_ps: 2289; X86: # %bb.0: # %entry 2290; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2291; X86-NEXT: kmovw %eax, %k1 2292; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} 2293; X86-NEXT: retl 2294; 2295; X64-LABEL: test_mm_maskz_broadcastss_ps: 2296; X64: # %bb.0: # %entry 2297; X64-NEXT: kmovw %edi, %k1 2298; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} 2299; X64-NEXT: retq 2300entry: 2301 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer 2302 %0 = bitcast i8 %__M to <8 x i1> 2303 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2304 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer 2305 ret <4 x float> %1 2306} 2307 2308define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) { 2309; CHECK-LABEL: test_mm256_broadcastss_ps: 2310; CHECK: # %bb.0: 2311; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 2312; CHECK-NEXT: ret{{[l|q]}} 2313 %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer 2314 ret <8 x float> %res 2315} 2316 2317define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) { 2318; X86-LABEL: test_mm256_mask_broadcastss_ps: 2319; X86: # %bb.0: 2320; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2321; X86-NEXT: kmovw %eax, %k1 2322; 
X86-NEXT: vbroadcastss %xmm1, %ymm0 {%k1} 2323; X86-NEXT: retl 2324; 2325; X64-LABEL: test_mm256_mask_broadcastss_ps: 2326; X64: # %bb.0: 2327; X64-NEXT: kmovw %edi, %k1 2328; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1} 2329; X64-NEXT: retq 2330 %arg1 = bitcast i8 %a1 to <8 x i1> 2331 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer 2332 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0 2333 ret <8 x float> %res1 2334} 2335 2336define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) { 2337; X86-LABEL: test_mm256_maskz_broadcastss_ps: 2338; X86: # %bb.0: 2339; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2340; X86-NEXT: kmovw %eax, %k1 2341; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} 2342; X86-NEXT: retl 2343; 2344; X64-LABEL: test_mm256_maskz_broadcastss_ps: 2345; X64: # %bb.0: 2346; X64-NEXT: kmovw %edi, %k1 2347; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} 2348; X64-NEXT: retq 2349 %arg0 = bitcast i8 %a0 to <8 x i1> 2350 %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer 2351 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer 2352 ret <8 x float> %res1 2353} 2354 2355define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) { 2356; CHECK-LABEL: test_mm_movddup_pd: 2357; CHECK: # %bb.0: 2358; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2359; CHECK-NEXT: ret{{[l|q]}} 2360 %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer 2361 ret <2 x double> %res 2362} 2363 2364define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) { 2365; X86-LABEL: test_mm_mask_movedup_pd: 2366; X86: # %bb.0: # %entry 2367; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2368; X86-NEXT: kmovw %eax, %k1 2369; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] 2370; X86-NEXT: retl 2371; 2372; X64-LABEL: test_mm_mask_movedup_pd: 2373; X64: # %bb.0: # %entry 2374; X64-NEXT: kmovw %edi, %k1 2375; 
X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] 2376; X64-NEXT: retq 2377entry: 2378 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer 2379 %0 = bitcast i8 %__U to <8 x i1> 2380 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 2381 %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W 2382 ret <2 x double> %1 2383} 2384 2385define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) { 2386; X86-LABEL: test_mm_maskz_movedup_pd: 2387; X86: # %bb.0: # %entry 2388; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2389; X86-NEXT: kmovw %eax, %k1 2390; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] 2391; X86-NEXT: retl 2392; 2393; X64-LABEL: test_mm_maskz_movedup_pd: 2394; X64: # %bb.0: # %entry 2395; X64-NEXT: kmovw %edi, %k1 2396; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] 2397; X64-NEXT: retq 2398entry: 2399 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer 2400 %0 = bitcast i8 %__U to <8 x i1> 2401 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 2402 %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer 2403 ret <2 x double> %1 2404} 2405 2406define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) { 2407; CHECK-LABEL: test_mm256_movddup_pd: 2408; CHECK: # %bb.0: 2409; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 2410; CHECK-NEXT: ret{{[l|q]}} 2411 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 2412 ret <4 x double> %res 2413} 2414 2415define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) { 2416; X86-LABEL: test_mm256_mask_movedup_pd: 2417; X86: # %bb.0: # %entry 2418; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2419; X86-NEXT: kmovw %eax, %k1 2420; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2] 2421; X86-NEXT: retl 
;
; X64-LABEL: test_mm256_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
  ret <4 x double> %1
}

; Zero-masking variant: duplicate the even double lanes of %__A; lanes whose
; bit in the low four bits of %__U is clear are zeroed.
define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT: retq
entry:
  %dup = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x double> %dup, <4 x double> zeroinitializer
  ret <4 x double> %blend
}

; Unmasked duplicate of the odd float lanes (1,1,3,3) of a 128-bit vector.
define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
  %odd.dup = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %odd.dup
}

; Merge-masking variant: odd-lane duplicate of %__A blended into %__W under
; the low four bits of %__U.
define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT: retq
entry:
  %odd.dup = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x float> %odd.dup, <4 x float> %__W
  ret <4 x float> %blend
}

; Zero-masking variant: odd-lane duplicate of %__A, zeroing lanes whose mask
; bit is clear.
define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT: retq
entry:
  %odd.dup = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x float> %odd.dup, <4 x float> zeroinitializer
  ret <4 x float> %blend
}

; Unmasked duplicate of the odd float lanes of a 256-bit vector.
define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT: ret{{[l|q]}}
  %odd.dup = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %odd.dup
}

; Merge-masking variant: all eight bits of %a1 select between the odd-lane
; duplicate of %a2 and the pass-through %a0.
define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
  %mask = bitcast i8 %a1 to <8 x i1>
  %odd.dup = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %blend = select <8 x i1> %mask, <8 x float> %odd.dup, <8 x float> %a0
  ret <8 x float> %blend
}

; Zero-masking variant: odd-lane duplicate of %a1, zeroing lanes whose bit in
; %a0 is clear.
define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
  %mask = bitcast i8 %a0 to <8 x i1>
  %odd.dup = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %blend = select <8 x i1> %mask, <8 x float> %odd.dup, <8 x float> zeroinitializer
  ret <8 x float> %blend
}

; Unmasked duplicate of the even float lanes (0,0,2,2) of a 128-bit vector.
define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
  %even.dup = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %even.dup
}

; Merge-masking variant: even-lane duplicate of %__A blended into %__W under
; the low four bits of %__U.
define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT: retq
entry:
  %even.dup = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x float> %even.dup, <4 x float> %__W
  ret <4 x float> %blend
}

; Zero-masking variant: even-lane duplicate of %__A, zeroing lanes whose mask
; bit is clear.
define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT: retq
entry:
  %even.dup = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x float> %even.dup, <4 x float> zeroinitializer
  ret <4 x float> %blend
}

; Unmasked duplicate of the even float lanes of a 256-bit vector.
define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %even.dup = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %even.dup
}

; Merge-masking variant: all eight bits of %a1 select between the even-lane
; duplicate of %a2 and the pass-through %a0.
define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %mask = bitcast i8 %a1 to <8 x i1>
  %even.dup = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %blend = select <8 x i1> %mask, <8 x float> %even.dup, <8 x float> %a0
  ret <8 x float> %blend
}

; Zero-masking variant: even-lane duplicate of %a1, zeroing lanes whose bit in
; %a0 is clear.
define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %mask = bitcast i8 %a0 to <8 x i1>
  %even.dup = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %blend = select <8 x i1> %mask, <8 x float> %even.dup, <8 x float> zeroinitializer
  ret <8 x float> %blend
}

; Unmasked single-source permute of four i64 lanes with indices 3,0,0,0.
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %permuted = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %permuted
}

; Merge-masking variant: permute of %__X (3,0,0,0) blended into %__W under the
; low four bits of %__M.
define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_mask_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X64-NEXT: retq
entry:
  %permuted = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i64> %permuted, <4 x i64> %__W
  ret <4 x i64> %blend
}

; Zero-masking variant: permute of %__X (3,0,0,0), zeroing lanes whose mask bit
; is clear.
define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X64-NEXT: retq
entry:
  %permuted = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i64> %permuted, <4 x i64> zeroinitializer
  ret <4 x i64> %blend
}

; Unmasked single-source permute of four double lanes with indices 3,0,0,0.
define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %permuted = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %permuted
}

; Merge-masking variant: permute of %__X (1,0,0,0) blended into %__W under the
; low four bits of %__U.
define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_mask_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT: retq
entry:
  %permuted = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x double> %permuted, <4 x double> %__W
  ret <4 x double> %blend
}

; Zero-masking variant: permute of %__X (1,0,0,0), zeroing lanes whose mask bit
; is clear.
define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT: retq
entry:
  %permuted = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x double> %permuted, <4 x double> zeroinitializer
  ret <4 x double> %blend
}

; Unmasked two-source shuffle taking element 1 of each <2 x double> operand.
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_mm_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}}
  %shuffled = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %shuffled
}

; Merge-masking variant: the same high-element shuffle of %__A/%__B blended
; into %__W under the low two bits of %__U.
define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT: retq
entry:
  %shuffled = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %blend = select <2 x i1> %mask.lo2, <2 x double> %shuffled, <2 x double> %__W
  ret <2 x double> %blend
}

; Zero-masking variant: high-element shuffle of %__A/%__B, zeroing lanes whose
; mask bit is clear.
define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT: retq
entry:
  %shuffled = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %blend = select <2 x i1> %mask.lo2, <2 x double> %shuffled, <2 x double> zeroinitializer
  ret <2 x double> %blend
}

; Unmasked two-source shuffle of <4 x double> operands with indices 1,5,2,6.
define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
  %shuffled = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %shuffled
}

; Merge-masking variant: shuffle of %__A/%__B (1,5,2,6) blended into %__W under
; the low four bits of %__U.
define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT: retq
entry:
  %shuffled = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x double> %shuffled, <4 x double> %__W
  ret <4 x double> %blend
}

; Zero-masking variant: shuffle of %__A/%__B (1,5,2,6), zeroing lanes whose
; mask bit is clear.
define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT: retq
entry:
  %shuffled = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x double> %shuffled, <4 x double> zeroinitializer
  ret <4 x double> %blend
}

; Unmasked two-source shuffle of <4 x float> operands with indices 0,1,4,4.
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_mm_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %shuffled = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %shuffled
}

; Merge-masking variant: shuffle of %__A/%__B (0,1,4,4) blended into %__W under
; the low four bits of %__U.
define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT: retq
entry:
  %shuffled = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x float> %shuffled, <4 x float> %__W
  ret <4 x float> %blend
}

; Zero-masking variant: shuffle of %__A/%__B (0,1,4,4), zeroing lanes whose
; mask bit is clear.
define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
entry:
  %shuffled = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %mask.bits = bitcast i8 %__U to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x float> %shuffled, <4 x float> zeroinitializer
  ret <4 x float> %blend
}

; Unmasked two-source shuffle of <8 x float> operands; the same 0,1,a0,a0
; pattern is applied to each 128-bit half.
define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %shuffled = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %shuffled
}

; Merge-masking variant: all eight bits of %a1 select between the shuffle of
; %a2/%a3 and the pass-through %a0.
define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT: retq
  %mask = bitcast i8 %a1 to <8 x i1>
  %shuffled = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %blend = select <8 x i1> %mask, <8 x float> %shuffled, <8 x float> %a0
  ret <8 x float> %blend
}

; Zero-masking variant: shuffle of %a1/%a2, zeroing lanes whose bit in %a0 is
; clear.
define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_maskz_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT: retq
  %mask = bitcast i8 %a0 to <8 x i1>
  %shuffled = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %blend = select <8 x i1> %mask, <8 x float> %shuffled, <8 x float> zeroinitializer
  ret <8 x float> %blend
}

; Merge-masked signed 32x32->64 multiply: shl/ashr by 32 sign-extends the low
; half of every i64 lane before the multiply; the product is blended into %__W
; under the low four bits of %__M.
define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %x.shl = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
  %x.sext = ashr exact <4 x i64> %x.shl, <i64 32, i64 32, i64 32, i64 32>
  %y.shl = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
  %y.sext = ashr exact <4 x i64> %y.shl, <i64 32, i64 32, i64 32, i64 32>
  %prod = mul nsw <4 x i64> %y.sext, %x.sext
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i64> %prod, <4 x i64> %__W
  ret <4 x i64> %blend
}

; Zero-masked signed 32x32->64 multiply: shl/ashr by 32 sign-extends the low
; half of every i64 lane; lanes whose mask bit is clear are zeroed.
define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
  %x.shl = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
  %x.sext = ashr exact <4 x i64> %x.shl, <i64 32, i64 32, i64 32, i64 32>
  %y.shl = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
  %y.sext = ashr exact <4 x i64> %y.shl, <i64 32, i64 32, i64 32, i64 32>
  %prod = mul nsw <4 x i64> %y.sext, %x.sext
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i64> %prod, <4 x i64> zeroinitializer
  ret <4 x i64> %blend
}

; 128-bit merge-masked signed multiply; the low two bits of %__M choose between
; the product and the pass-through %__W.
define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
  %x.shl = shl <2 x i64> %__X, <i64 32, i64 32>
  %x.sext = ashr exact <2 x i64> %x.shl, <i64 32, i64 32>
  %y.shl = shl <2 x i64> %__Y, <i64 32, i64 32>
  %y.sext = ashr exact <2 x i64> %y.shl, <i64 32, i64 32>
  %prod = mul nsw <2 x i64> %y.sext, %x.sext
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %blend = select <2 x i1> %mask.lo2, <2 x i64> %prod, <2 x i64> %__W
  ret <2 x i64> %blend
}

; 128-bit zero-masked signed multiply; lanes whose mask bit is clear are zeroed.
define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
  %x.shl = shl <2 x i64> %__X, <i64 32, i64 32>
  %x.sext = ashr exact <2 x i64> %x.shl, <i64 32, i64 32>
  %y.shl = shl <2 x i64> %__Y, <i64 32, i64 32>
  %y.sext = ashr exact <2 x i64> %y.shl, <i64 32, i64 32>
  %prod = mul nsw <2 x i64> %y.sext, %x.sext
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %blend = select <2 x i1> %mask.lo2, <2 x i64> %prod, <2 x i64> zeroinitializer
  ret <2 x i64> %blend
}

; Merge-masked unsigned 32x32->64 multiply: the and with 4294967295 keeps only
; the low 32 bits of every i64 lane before the multiply.
define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %x.lo = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %y.lo = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %prod = mul nuw <4 x i64> %y.lo, %x.lo
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i64> %prod, <4 x i64> %__W
  ret <4 x i64> %blend
}

; Zero-masked unsigned 32x32->64 multiply on four i64 lanes.
define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %x.lo = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %y.lo = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %prod = mul nuw <4 x i64> %y.lo, %x.lo
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i64> %prod, <4 x i64> zeroinitializer
  ret <4 x i64> %blend
}

; 128-bit merge-masked unsigned multiply; the low two bits of %__M choose
; between the product and the pass-through %__W.
define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %x.lo = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
  %y.lo = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
  %prod = mul nuw <2 x i64> %y.lo, %x.lo
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %blend = select <2 x i1> %mask.lo2, <2 x i64> %prod, <2 x i64> %__W
  ret <2 x i64> %blend
}

; 128-bit zero-masked unsigned multiply; lanes whose mask bit is clear are
; zeroed.
define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %x.lo = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
  %y.lo = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
  %prod = mul nuw <2 x i64> %y.lo, %x.lo
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo2 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %blend = select <2 x i1> %mask.lo2, <2 x i64> %prod, <2 x i64> zeroinitializer
  ret <2 x i64> %blend
}

; Truncate four i32 lanes to i8 and widen to 16 bytes; shuffle indices 4..7
; pick from the zeroinitializer operand, so the upper twelve bytes are zero.
define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %dwords = bitcast <2 x i64> %__A to <4 x i32>
  %narrow = trunc <4 x i32> %dwords to <4 x i8>
  %widened = shufflevector <4 x i8> %narrow, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %result = bitcast <16 x i8> %widened to <2 x i64>
  ret <2 x i64> %result
}

; Truncate four i32 lanes to i16 and concatenate with a zero vector to fill
; the upper half of the 128-bit result.
define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %dwords = bitcast <2 x i64> %__A to <4 x i32>
  %narrow = trunc <4 x i32> %dwords to <4 x i16>
  %widened = shufflevector <4 x i16> %narrow, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %result = bitcast <8 x i16> %widened to <2 x i64>
  ret <2 x i64> %result
}

; Truncate two i64 lanes to i8 and widen to 16 bytes; shuffle indices 2..3 pick
; from the zeroinitializer operand, so the upper fourteen bytes are zero.
define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %narrow = trunc <2 x i64> %__A to <2 x i8>
  %widened = shufflevector <2 x i8> %narrow, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %result = bitcast <16 x i8> %widened to <2 x i64>
  ret <2 x i64> %result
}

; Truncate two i64 lanes to i16 and widen to eight i16 lanes, zero-filling the
; upper six.
define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %narrow = trunc <2 x i64> %__A to <2 x i16>
  %widened = shufflevector <2 x i16> %narrow, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %result = bitcast <8 x i16> %widened to <2 x i64>
  ret <2 x i64> %result
}

; Truncate two i64 lanes to i32 and concatenate with a zero vector to fill the
; upper half of the 128-bit result.
define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %narrow = trunc <2 x i64> %__A to <2 x i32>
  %widened = shufflevector <2 x i32> %narrow, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %result = bitcast <4 x i32> %widened to <2 x i64>
  ret <2 x i64> %result
}

; Unmasked truncation of eight i32 lanes to eight i16 lanes.
define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi32_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdw %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %dwords = bitcast <4 x i64> %__A to <8 x i32>
  %narrow = trunc <8 x i32> %dwords to <8 x i16>
  %result = bitcast <8 x i16> %narrow to <2 x i64>
  ret <2 x i64> %result
}

; Merge-masked i32->i16 truncation through the mask.pmov.dw.256 intrinsic, with
; %__O as the pass-through operand and %__M as the mask.
define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %dwords = bitcast <4 x i64> %__A to <8 x i32>
  %passthru = bitcast <2 x i64> %__O to <8 x i16>
  %narrow = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %dwords, <8 x i16> %passthru, i8 %__M)
  %result = bitcast <8 x i16> %narrow to <2 x i64>
  ret <2 x i64> %result
}

; Zero-masked i32->i16 truncation: the same intrinsic with a zeroinitializer
; pass-through operand.
define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %dwords = bitcast <4 x i64> %__A to <8 x i32>
  %narrow = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %dwords, <8 x i16> zeroinitializer, i8 %__M)
  %result = bitcast <8 x i16> %narrow to <2 x i64>
  ret <2 x i64> %result
}

; Unmasked truncation of four i64 lanes to four i32 lanes.
define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqd %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %narrow = trunc <4 x i64> %__A to <4 x i32>
  %result = bitcast <4 x i32> %narrow to <2 x i64>
  ret <2 x i64> %result
}

; Merge-masked i64->i32 truncation expressed as trunc + select on the low four
; bits of %__M, with %__O as the pass-through.
define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %narrow = trunc <4 x i64> %__A to <4 x i32>
  %passthru = bitcast <2 x i64> %__O to <4 x i32>
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i32> %narrow, <4 x i32> %passthru
  %result = bitcast <4 x i32> %blend to <2 x i64>
  ret <2 x i64> %result
}

; Zero-masked i64->i32 truncation: lanes whose mask bit is clear are zeroed.
define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %narrow = trunc <4 x i64> %__A to <4 x i32>
  %mask.bits = bitcast i8 %__M to <8 x i1>
  %mask.lo4 = shufflevector <8 x i1> %mask.bits, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %blend = select <4 x i1> %mask.lo4, <4 x i32> %narrow, <4 x i32> zeroinitializer
  %result = bitcast <4 x i32> %blend to <2 x i64>
  ret <2 x i64> %result
}

; Truncate four i64 lanes to i8 and widen to 16 bytes; shuffle indices 4..7
; pick from the zeroinitializer operand, so the upper twelve bytes are zero.
define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %narrow = trunc <4 x i64> %__A to <4 x i8>
  %widened = shufflevector <4 x i8> %narrow, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %result = bitcast <16 x i8> %widened to <2 x i64>
  ret <2 x i64> %result
}

; Truncate four i64 lanes to i16 and concatenate with a zero vector to fill the
; upper half of the 128-bit result.
define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqw %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %narrow = trunc <4 x i64> %__A to <4 x i16>
  %widened = shufflevector <4 x i16> %narrow, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %result = bitcast <8 x i16> %widened to <2 x i64>
  ret <2 x i64> %result
}

define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdb %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = trunc <8 x i32> %0 to <8 x i8>
  %shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32
14, i32 15> 3361 %1 = bitcast <16 x i8> %shuf.i to <2 x i64> 3362 ret <2 x i64> %1 3363} 3364 3365define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 3366; CHECK-LABEL: test_mm_ternarylogic_epi32: 3367; CHECK: # %bb.0: # %entry 3368; CHECK-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 3369; CHECK-NEXT: ret{{[l|q]}} 3370entry: 3371 %0 = bitcast <2 x i64> %__A to <4 x i32> 3372 %1 = bitcast <2 x i64> %__B to <4 x i32> 3373 %2 = bitcast <2 x i64> %__C to <4 x i32> 3374 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4) 3375 %4 = bitcast <4 x i32> %3 to <2 x i64> 3376 ret <2 x i64> %4 3377} 3378 3379declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2 3380 3381define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) { 3382; X86-LABEL: test_mm_mask_ternarylogic_epi32: 3383; X86: # %bb.0: # %entry 3384; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3385; X86-NEXT: kmovw %eax, %k1 3386; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} 3387; X86-NEXT: retl 3388; 3389; X64-LABEL: test_mm_mask_ternarylogic_epi32: 3390; X64: # %bb.0: # %entry 3391; X64-NEXT: kmovw %edi, %k1 3392; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} 3393; X64-NEXT: retq 3394entry: 3395 %0 = bitcast <2 x i64> %__A to <4 x i32> 3396 %1 = bitcast <2 x i64> %__B to <4 x i32> 3397 %2 = bitcast <2 x i64> %__C to <4 x i32> 3398 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4) 3399 %4 = bitcast i8 %__U to <8 x i1> 3400 %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3401 %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0 3402 %6 = bitcast <4 x i32> %5 to <2 x i64> 3403 ret <2 x i64> %6 3404} 3405 3406define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 
3407; X86-LABEL: test_mm_maskz_ternarylogic_epi32: 3408; X86: # %bb.0: # %entry 3409; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3410; X86-NEXT: kmovw %eax, %k1 3411; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3412; X86-NEXT: retl 3413; 3414; X64-LABEL: test_mm_maskz_ternarylogic_epi32: 3415; X64: # %bb.0: # %entry 3416; X64-NEXT: kmovw %edi, %k1 3417; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3418; X64-NEXT: retq 3419entry: 3420 %0 = bitcast <2 x i64> %__A to <4 x i32> 3421 %1 = bitcast <2 x i64> %__B to <4 x i32> 3422 %2 = bitcast <2 x i64> %__C to <4 x i32> 3423 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4) 3424 %4 = bitcast i8 %__U to <8 x i1> 3425 %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3426 %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer 3427 %6 = bitcast <4 x i32> %5 to <2 x i64> 3428 ret <2 x i64> %6 3429} 3430 3431define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { 3432; CHECK-LABEL: test_mm256_ternarylogic_epi32: 3433; CHECK: # %bb.0: # %entry 3434; CHECK-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 3435; CHECK-NEXT: ret{{[l|q]}} 3436entry: 3437 %0 = bitcast <4 x i64> %__A to <8 x i32> 3438 %1 = bitcast <4 x i64> %__B to <8 x i32> 3439 %2 = bitcast <4 x i64> %__C to <8 x i32> 3440 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4) 3441 %4 = bitcast <8 x i32> %3 to <4 x i64> 3442 ret <4 x i64> %4 3443} 3444 3445declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2 3446 3447define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) { 3448; X86-LABEL: test_mm256_mask_ternarylogic_epi32: 3449; X86: # %bb.0: # %entry 3450; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3451; X86-NEXT: kmovw %eax, %k1 3452; X86-NEXT: vpternlogd 
$4, %ymm2, %ymm1, %ymm0 {%k1} 3453; X86-NEXT: retl 3454; 3455; X64-LABEL: test_mm256_mask_ternarylogic_epi32: 3456; X64: # %bb.0: # %entry 3457; X64-NEXT: kmovw %edi, %k1 3458; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} 3459; X64-NEXT: retq 3460entry: 3461 %0 = bitcast <4 x i64> %__A to <8 x i32> 3462 %1 = bitcast <4 x i64> %__B to <8 x i32> 3463 %2 = bitcast <4 x i64> %__C to <8 x i32> 3464 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4) 3465 %4 = bitcast i8 %__U to <8 x i1> 3466 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0 3467 %6 = bitcast <8 x i32> %5 to <4 x i64> 3468 ret <4 x i64> %6 3469} 3470 3471define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { 3472; X86-LABEL: test_mm256_maskz_ternarylogic_epi32: 3473; X86: # %bb.0: # %entry 3474; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3475; X86-NEXT: kmovw %eax, %k1 3476; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3477; X86-NEXT: retl 3478; 3479; X64-LABEL: test_mm256_maskz_ternarylogic_epi32: 3480; X64: # %bb.0: # %entry 3481; X64-NEXT: kmovw %edi, %k1 3482; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3483; X64-NEXT: retq 3484entry: 3485 %0 = bitcast <4 x i64> %__A to <8 x i32> 3486 %1 = bitcast <4 x i64> %__B to <8 x i32> 3487 %2 = bitcast <4 x i64> %__C to <8 x i32> 3488 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4) 3489 %4 = bitcast i8 %__U to <8 x i1> 3490 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer 3491 %6 = bitcast <8 x i32> %5 to <4 x i64> 3492 ret <4 x i64> %6 3493} 3494 3495define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 3496; CHECK-LABEL: test_mm_ternarylogic_epi64: 3497; CHECK: # %bb.0: # %entry 3498; CHECK-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 3499; CHECK-NEXT: ret{{[l|q]}} 3500entry: 3501 %0 = tail call <2 x i64> 
@llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) 3502 ret <2 x i64> %0 3503} 3504 3505declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2 3506 3507define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) { 3508; X86-LABEL: test_mm_mask_ternarylogic_epi64: 3509; X86: # %bb.0: # %entry 3510; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3511; X86-NEXT: kmovw %eax, %k1 3512; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} 3513; X86-NEXT: retl 3514; 3515; X64-LABEL: test_mm_mask_ternarylogic_epi64: 3516; X64: # %bb.0: # %entry 3517; X64-NEXT: kmovw %edi, %k1 3518; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} 3519; X64-NEXT: retq 3520entry: 3521 %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) 3522 %1 = bitcast i8 %__U to <8 x i1> 3523 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3524 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A 3525 ret <2 x i64> %2 3526} 3527 3528define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 3529; X86-LABEL: test_mm_maskz_ternarylogic_epi64: 3530; X86: # %bb.0: # %entry 3531; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3532; X86-NEXT: kmovw %eax, %k1 3533; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3534; X86-NEXT: retl 3535; 3536; X64-LABEL: test_mm_maskz_ternarylogic_epi64: 3537; X64: # %bb.0: # %entry 3538; X64-NEXT: kmovw %edi, %k1 3539; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3540; X64-NEXT: retq 3541entry: 3542 %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) 3543 %1 = bitcast i8 %__U to <8 x i1> 3544 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3545 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> 
zeroinitializer 3546 ret <2 x i64> %2 3547} 3548 3549define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { 3550; CHECK-LABEL: test_mm256_ternarylogic_epi64: 3551; CHECK: # %bb.0: # %entry 3552; CHECK-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 3553; CHECK-NEXT: ret{{[l|q]}} 3554entry: 3555 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) 3556 ret <4 x i64> %0 3557} 3558 3559declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2 3560 3561define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) { 3562; X86-LABEL: test_mm256_mask_ternarylogic_epi64: 3563; X86: # %bb.0: # %entry 3564; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3565; X86-NEXT: kmovw %eax, %k1 3566; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} 3567; X86-NEXT: retl 3568; 3569; X64-LABEL: test_mm256_mask_ternarylogic_epi64: 3570; X64: # %bb.0: # %entry 3571; X64-NEXT: kmovw %edi, %k1 3572; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} 3573; X64-NEXT: retq 3574entry: 3575 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) 3576 %1 = bitcast i8 %__U to <8 x i1> 3577 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3578 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A 3579 ret <4 x i64> %2 3580} 3581 3582define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { 3583; X86-LABEL: test_mm256_maskz_ternarylogic_epi64: 3584; X86: # %bb.0: # %entry 3585; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3586; X86-NEXT: kmovw %eax, %k1 3587; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3588; X86-NEXT: retl 3589; 3590; X64-LABEL: test_mm256_maskz_ternarylogic_epi64: 3591; X64: # %bb.0: # %entry 3592; X64-NEXT: kmovw %edi, %k1 3593; X64-NEXT: 
vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3594; X64-NEXT: retq 3595entry: 3596 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) 3597 %1 = bitcast i8 %__U to <8 x i1> 3598 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3599 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer 3600 ret <4 x i64> %2 3601} 3602 3603define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) { 3604; X86-LABEL: test_mm_mask2_permutex2var_epi32: 3605; X86: # %bb.0: # %entry 3606; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3607; X86-NEXT: kmovw %eax, %k1 3608; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} 3609; X86-NEXT: vmovdqa %xmm1, %xmm0 3610; X86-NEXT: retl 3611; 3612; X64-LABEL: test_mm_mask2_permutex2var_epi32: 3613; X64: # %bb.0: # %entry 3614; X64-NEXT: kmovw %edi, %k1 3615; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} 3616; X64-NEXT: vmovdqa %xmm1, %xmm0 3617; X64-NEXT: retq 3618entry: 3619 %0 = bitcast <2 x i64> %__A to <4 x i32> 3620 %1 = bitcast <2 x i64> %__I to <4 x i32> 3621 %2 = bitcast <2 x i64> %__B to <4 x i32> 3622 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3623 %4 = bitcast i8 %__U to <8 x i1> 3624 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3625 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1 3626 %6 = bitcast <4 x i32> %5 to <2 x i64> 3627 ret <2 x i64> %6 3628} 3629 3630define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) { 3631; X86-LABEL: test_mm256_mask2_permutex2var_epi32: 3632; X86: # %bb.0: # %entry 3633; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3634; X86-NEXT: kmovw %eax, %k1 3635; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} 3636; X86-NEXT: vmovdqa %ymm1, %ymm0 3637; X86-NEXT: retl 3638; 3639; 
X64-LABEL: test_mm256_mask2_permutex2var_epi32: 3640; X64: # %bb.0: # %entry 3641; X64-NEXT: kmovw %edi, %k1 3642; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} 3643; X64-NEXT: vmovdqa %ymm1, %ymm0 3644; X64-NEXT: retq 3645entry: 3646 %0 = bitcast <4 x i64> %__A to <8 x i32> 3647 %1 = bitcast <4 x i64> %__I to <8 x i32> 3648 %2 = bitcast <4 x i64> %__B to <8 x i32> 3649 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 3650 %4 = bitcast i8 %__U to <8 x i1> 3651 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1 3652 %6 = bitcast <8 x i32> %5 to <4 x i64> 3653 ret <4 x i64> %6 3654} 3655 3656define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) { 3657; X86-LABEL: test_mm_mask2_permutex2var_pd: 3658; X86: # %bb.0: # %entry 3659; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3660; X86-NEXT: kmovw %eax, %k1 3661; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} 3662; X86-NEXT: vmovapd %xmm1, %xmm0 3663; X86-NEXT: retl 3664; 3665; X64-LABEL: test_mm_mask2_permutex2var_pd: 3666; X64: # %bb.0: # %entry 3667; X64-NEXT: kmovw %edi, %k1 3668; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} 3669; X64-NEXT: vmovapd %xmm1, %xmm0 3670; X64-NEXT: retq 3671entry: 3672 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 3673 %1 = bitcast <2 x i64> %__I to <2 x double> 3674 %2 = bitcast i8 %__U to <8 x i1> 3675 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3676 %3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1 3677 ret <2 x double> %3 3678} 3679 3680define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) { 3681; X86-LABEL: test_mm256_mask2_permutex2var_pd: 3682; X86: # %bb.0: # %entry 3683; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3684; X86-NEXT: kmovw %eax, %k1 3685; X86-NEXT: vpermi2pd %ymm2, %ymm0, 
%ymm1 {%k1} 3686; X86-NEXT: vmovapd %ymm1, %ymm0 3687; X86-NEXT: retl 3688; 3689; X64-LABEL: test_mm256_mask2_permutex2var_pd: 3690; X64: # %bb.0: # %entry 3691; X64-NEXT: kmovw %edi, %k1 3692; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} 3693; X64-NEXT: vmovapd %ymm1, %ymm0 3694; X64-NEXT: retq 3695entry: 3696 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 3697 %1 = bitcast <4 x i64> %__I to <4 x double> 3698 %2 = bitcast i8 %__U to <8 x i1> 3699 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3700 %3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1 3701 ret <4 x double> %3 3702} 3703 3704define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) { 3705; X86-LABEL: test_mm_mask2_permutex2var_ps: 3706; X86: # %bb.0: # %entry 3707; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3708; X86-NEXT: kmovw %eax, %k1 3709; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} 3710; X86-NEXT: vmovaps %xmm1, %xmm0 3711; X86-NEXT: retl 3712; 3713; X64-LABEL: test_mm_mask2_permutex2var_ps: 3714; X64: # %bb.0: # %entry 3715; X64-NEXT: kmovw %edi, %k1 3716; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} 3717; X64-NEXT: vmovaps %xmm1, %xmm0 3718; X64-NEXT: retq 3719entry: 3720 %0 = bitcast <2 x i64> %__I to <4 x i32> 3721 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 3722 %2 = bitcast <2 x i64> %__I to <4 x float> 3723 %3 = bitcast i8 %__U to <8 x i1> 3724 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3725 %4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2 3726 ret <4 x float> %4 3727} 3728 3729define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) { 3730; X86-LABEL: test_mm256_mask2_permutex2var_ps: 3731; X86: # %bb.0: 
# %entry 3732; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3733; X86-NEXT: kmovw %eax, %k1 3734; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} 3735; X86-NEXT: vmovaps %ymm1, %ymm0 3736; X86-NEXT: retl 3737; 3738; X64-LABEL: test_mm256_mask2_permutex2var_ps: 3739; X64: # %bb.0: # %entry 3740; X64-NEXT: kmovw %edi, %k1 3741; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} 3742; X64-NEXT: vmovaps %ymm1, %ymm0 3743; X64-NEXT: retq 3744entry: 3745 %0 = bitcast <4 x i64> %__I to <8 x i32> 3746 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 3747 %2 = bitcast <4 x i64> %__I to <8 x float> 3748 %3 = bitcast i8 %__U to <8 x i1> 3749 %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2 3750 ret <8 x float> %4 3751} 3752 3753define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) { 3754; X86-LABEL: test_mm_mask2_permutex2var_epi64: 3755; X86: # %bb.0: # %entry 3756; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3757; X86-NEXT: kmovw %eax, %k1 3758; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} 3759; X86-NEXT: vmovdqa %xmm1, %xmm0 3760; X86-NEXT: retl 3761; 3762; X64-LABEL: test_mm_mask2_permutex2var_epi64: 3763; X64: # %bb.0: # %entry 3764; X64-NEXT: kmovw %edi, %k1 3765; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} 3766; X64-NEXT: vmovdqa %xmm1, %xmm0 3767; X64-NEXT: retq 3768entry: 3769 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 3770 %1 = bitcast i8 %__U to <8 x i1> 3771 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3772 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I 3773 ret <2 x i64> %2 3774} 3775 3776define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) { 3777; X86-LABEL: test_mm256_mask2_permutex2var_epi64: 3778; X86: # %bb.0: # %entry 3779; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3780; 
X86-NEXT: kmovw %eax, %k1 3781; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} 3782; X86-NEXT: vmovdqa %ymm1, %ymm0 3783; X86-NEXT: retl 3784; 3785; X64-LABEL: test_mm256_mask2_permutex2var_epi64: 3786; X64: # %bb.0: # %entry 3787; X64-NEXT: kmovw %edi, %k1 3788; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} 3789; X64-NEXT: vmovdqa %ymm1, %ymm0 3790; X64-NEXT: retq 3791entry: 3792 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 3793 %1 = bitcast i8 %__U to <8 x i1> 3794 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3795 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I 3796 ret <4 x i64> %2 3797} 3798 3799define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) { 3800; CHECK-LABEL: test_mm_permutex2var_epi32: 3801; CHECK: # %bb.0: # %entry 3802; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 3803; CHECK-NEXT: ret{{[l|q]}} 3804entry: 3805 %0 = bitcast <2 x i64> %__A to <4 x i32> 3806 %1 = bitcast <2 x i64> %__I to <4 x i32> 3807 %2 = bitcast <2 x i64> %__B to <4 x i32> 3808 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3809 %4 = bitcast <4 x i32> %3 to <2 x i64> 3810 ret <2 x i64> %4 3811} 3812 3813define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) { 3814; X86-LABEL: test_mm_mask_permutex2var_epi32: 3815; X86: # %bb.0: # %entry 3816; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3817; X86-NEXT: kmovw %eax, %k1 3818; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} 3819; X86-NEXT: retl 3820; 3821; X64-LABEL: test_mm_mask_permutex2var_epi32: 3822; X64: # %bb.0: # %entry 3823; X64-NEXT: kmovw %edi, %k1 3824; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} 3825; X64-NEXT: retq 3826entry: 3827 %0 = bitcast <2 x i64> %__A to <4 x i32> 3828 %1 = bitcast <2 x i64> %__I to <4 x i32> 3829 %2 = bitcast <2 x i64> %__B to <4 x i32> 
3830 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3831 %4 = bitcast i8 %__U to <8 x i1> 3832 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3833 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0 3834 %6 = bitcast <4 x i32> %5 to <2 x i64> 3835 ret <2 x i64> %6 3836} 3837 3838define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) { 3839; X86-LABEL: test_mm_maskz_permutex2var_epi32: 3840; X86: # %bb.0: # %entry 3841; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3842; X86-NEXT: kmovw %eax, %k1 3843; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z} 3844; X86-NEXT: retl 3845; 3846; X64-LABEL: test_mm_maskz_permutex2var_epi32: 3847; X64: # %bb.0: # %entry 3848; X64-NEXT: kmovw %edi, %k1 3849; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z} 3850; X64-NEXT: retq 3851entry: 3852 %0 = bitcast <2 x i64> %__A to <4 x i32> 3853 %1 = bitcast <2 x i64> %__I to <4 x i32> 3854 %2 = bitcast <2 x i64> %__B to <4 x i32> 3855 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3856 %4 = bitcast i8 %__U to <8 x i1> 3857 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3858 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer 3859 %6 = bitcast <4 x i32> %5 to <2 x i64> 3860 ret <2 x i64> %6 3861} 3862 3863define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 3864; CHECK-LABEL: test_mm256_permutex2var_epi32: 3865; CHECK: # %bb.0: # %entry 3866; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 3867; CHECK-NEXT: ret{{[l|q]}} 3868entry: 3869 %0 = bitcast <4 x i64> %__A to <8 x i32> 3870 %1 = bitcast <4 x i64> %__I to <8 x i32> 3871 %2 = bitcast <4 x i64> %__B to <8 x i32> 3872 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 
3873 %4 = bitcast <8 x i32> %3 to <4 x i64> 3874 ret <4 x i64> %4 3875} 3876 3877define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) { 3878; X86-LABEL: test_mm256_mask_permutex2var_epi32: 3879; X86: # %bb.0: # %entry 3880; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3881; X86-NEXT: kmovw %eax, %k1 3882; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} 3883; X86-NEXT: retl 3884; 3885; X64-LABEL: test_mm256_mask_permutex2var_epi32: 3886; X64: # %bb.0: # %entry 3887; X64-NEXT: kmovw %edi, %k1 3888; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} 3889; X64-NEXT: retq 3890entry: 3891 %0 = bitcast <4 x i64> %__A to <8 x i32> 3892 %1 = bitcast <4 x i64> %__I to <8 x i32> 3893 %2 = bitcast <4 x i64> %__B to <8 x i32> 3894 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 3895 %4 = bitcast i8 %__U to <8 x i1> 3896 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0 3897 %6 = bitcast <8 x i32> %5 to <4 x i64> 3898 ret <4 x i64> %6 3899} 3900 3901define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 3902; X86-LABEL: test_mm256_maskz_permutex2var_epi32: 3903; X86: # %bb.0: # %entry 3904; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3905; X86-NEXT: kmovw %eax, %k1 3906; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z} 3907; X86-NEXT: retl 3908; 3909; X64-LABEL: test_mm256_maskz_permutex2var_epi32: 3910; X64: # %bb.0: # %entry 3911; X64-NEXT: kmovw %edi, %k1 3912; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z} 3913; X64-NEXT: retq 3914entry: 3915 %0 = bitcast <4 x i64> %__A to <8 x i32> 3916 %1 = bitcast <4 x i64> %__I to <8 x i32> 3917 %2 = bitcast <4 x i64> %__B to <8 x i32> 3918 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 3919 %4 = bitcast i8 %__U to <8 x i1> 3920 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer 3921 %6 = bitcast <8 x i32> %5 
to <4 x i64> 3922 ret <4 x i64> %6 3923} 3924 3925define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) { 3926; CHECK-LABEL: test_mm_permutex2var_pd: 3927; CHECK: # %bb.0: # %entry 3928; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 3929; CHECK-NEXT: ret{{[l|q]}} 3930entry: 3931 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 3932 ret <2 x double> %0 3933} 3934 3935define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) { 3936; X86-LABEL: test_mm_mask_permutex2var_pd: 3937; X86: # %bb.0: # %entry 3938; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3939; X86-NEXT: kmovw %eax, %k1 3940; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} 3941; X86-NEXT: retl 3942; 3943; X64-LABEL: test_mm_mask_permutex2var_pd: 3944; X64: # %bb.0: # %entry 3945; X64-NEXT: kmovw %edi, %k1 3946; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} 3947; X64-NEXT: retq 3948entry: 3949 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 3950 %1 = bitcast i8 %__U to <8 x i1> 3951 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3952 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 3953 ret <2 x double> %2 3954} 3955 3956define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) { 3957; X86-LABEL: test_mm_maskz_permutex2var_pd: 3958; X86: # %bb.0: # %entry 3959; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3960; X86-NEXT: kmovw %eax, %k1 3961; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z} 3962; X86-NEXT: retl 3963; 3964; X64-LABEL: test_mm_maskz_permutex2var_pd: 3965; X64: # %bb.0: # %entry 3966; X64-NEXT: kmovw %edi, %k1 3967; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z} 3968; X64-NEXT: retq 3969entry: 3970 %0 = tail call <2 x double> 
@llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 3971 %1 = bitcast i8 %__U to <8 x i1> 3972 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3973 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 3974 ret <2 x double> %2 3975} 3976 3977define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) { 3978; CHECK-LABEL: test_mm256_permutex2var_pd: 3979; CHECK: # %bb.0: # %entry 3980; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 3981; CHECK-NEXT: ret{{[l|q]}} 3982entry: 3983 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 3984 ret <4 x double> %0 3985} 3986 3987define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) { 3988; X86-LABEL: test_mm256_mask_permutex2var_pd: 3989; X86: # %bb.0: # %entry 3990; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3991; X86-NEXT: kmovw %eax, %k1 3992; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} 3993; X86-NEXT: retl 3994; 3995; X64-LABEL: test_mm256_mask_permutex2var_pd: 3996; X64: # %bb.0: # %entry 3997; X64-NEXT: kmovw %edi, %k1 3998; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} 3999; X64-NEXT: retq 4000entry: 4001 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 4002 %1 = bitcast i8 %__U to <8 x i1> 4003 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4004 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4005 ret <4 x double> %2 4006} 4007 4008define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) { 4009; X86-LABEL: test_mm256_maskz_permutex2var_pd: 4010; X86: # %bb.0: # %entry 4011; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4012; X86-NEXT: kmovw %eax, %k1 4013; X86-NEXT: 
vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z} 4014; X86-NEXT: retl 4015; 4016; X64-LABEL: test_mm256_maskz_permutex2var_pd: 4017; X64: # %bb.0: # %entry 4018; X64-NEXT: kmovw %edi, %k1 4019; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z} 4020; X64-NEXT: retq 4021entry: 4022 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 4023 %1 = bitcast i8 %__U to <8 x i1> 4024 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4025 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4026 ret <4 x double> %2 4027} 4028 4029define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) { 4030; CHECK-LABEL: test_mm_permutex2var_ps: 4031; CHECK: # %bb.0: # %entry 4032; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 4033; CHECK-NEXT: ret{{[l|q]}} 4034entry: 4035 %0 = bitcast <2 x i64> %__I to <4 x i32> 4036 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 4037 ret <4 x float> %1 4038} 4039 4040define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) { 4041; X86-LABEL: test_mm_mask_permutex2var_ps: 4042; X86: # %bb.0: # %entry 4043; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4044; X86-NEXT: kmovw %eax, %k1 4045; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} 4046; X86-NEXT: retl 4047; 4048; X64-LABEL: test_mm_mask_permutex2var_ps: 4049; X64: # %bb.0: # %entry 4050; X64-NEXT: kmovw %edi, %k1 4051; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} 4052; X64-NEXT: retq 4053entry: 4054 %0 = bitcast <2 x i64> %__I to <4 x i32> 4055 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 4056 %2 = bitcast i8 %__U to <8 x i1> 4057 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4058 %3 = select <4 x i1> %extract.i, <4 
x float> %1, <4 x float> %__A 4059 ret <4 x float> %3 4060} 4061 4062define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) { 4063; X86-LABEL: test_mm_maskz_permutex2var_ps: 4064; X86: # %bb.0: # %entry 4065; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4066; X86-NEXT: kmovw %eax, %k1 4067; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z} 4068; X86-NEXT: retl 4069; 4070; X64-LABEL: test_mm_maskz_permutex2var_ps: 4071; X64: # %bb.0: # %entry 4072; X64-NEXT: kmovw %edi, %k1 4073; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z} 4074; X64-NEXT: retq 4075entry: 4076 %0 = bitcast <2 x i64> %__I to <4 x i32> 4077 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 4078 %2 = bitcast i8 %__U to <8 x i1> 4079 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4080 %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer 4081 ret <4 x float> %3 4082} 4083 4084define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) { 4085; CHECK-LABEL: test_mm256_permutex2var_ps: 4086; CHECK: # %bb.0: # %entry 4087; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 4088; CHECK-NEXT: ret{{[l|q]}} 4089entry: 4090 %0 = bitcast <4 x i64> %__I to <8 x i32> 4091 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 4092 ret <8 x float> %1 4093} 4094 4095define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) { 4096; X86-LABEL: test_mm256_mask_permutex2var_ps: 4097; X86: # %bb.0: # %entry 4098; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4099; X86-NEXT: kmovw %eax, %k1 4100; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} 4101; X86-NEXT: retl 4102; 4103; X64-LABEL: test_mm256_mask_permutex2var_ps: 4104; X64: # %bb.0: # %entry 4105; X64-NEXT: kmovw %edi, %k1 4106; X64-NEXT: 
vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} 4107; X64-NEXT: retq 4108entry: 4109 %0 = bitcast <4 x i64> %__I to <8 x i32> 4110 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 4111 %2 = bitcast i8 %__U to <8 x i1> 4112 %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A 4113 ret <8 x float> %3 4114} 4115 4116define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) { 4117; X86-LABEL: test_mm256_maskz_permutex2var_ps: 4118; X86: # %bb.0: # %entry 4119; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4120; X86-NEXT: kmovw %eax, %k1 4121; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z} 4122; X86-NEXT: retl 4123; 4124; X64-LABEL: test_mm256_maskz_permutex2var_ps: 4125; X64: # %bb.0: # %entry 4126; X64-NEXT: kmovw %edi, %k1 4127; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z} 4128; X64-NEXT: retq 4129entry: 4130 %0 = bitcast <4 x i64> %__I to <8 x i32> 4131 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 4132 %2 = bitcast i8 %__U to <8 x i1> 4133 %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer 4134 ret <8 x float> %3 4135} 4136 4137define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) { 4138; CHECK-LABEL: test_mm_permutex2var_epi64: 4139; CHECK: # %bb.0: # %entry 4140; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 4141; CHECK-NEXT: ret{{[l|q]}} 4142entry: 4143 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 4144 ret <2 x i64> %0 4145} 4146 4147define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) { 4148; X86-LABEL: test_mm_mask_permutex2var_epi64: 4149; X86: # %bb.0: # %entry 4150; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4151; X86-NEXT: kmovw %eax, %k1 4152; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} 4153; X86-NEXT: 
retl 4154; 4155; X64-LABEL: test_mm_mask_permutex2var_epi64: 4156; X64: # %bb.0: # %entry 4157; X64-NEXT: kmovw %edi, %k1 4158; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} 4159; X64-NEXT: retq 4160entry: 4161 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 4162 %1 = bitcast i8 %__U to <8 x i1> 4163 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4164 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A 4165 ret <2 x i64> %2 4166} 4167 4168define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) { 4169; X86-LABEL: test_mm_maskz_permutex2var_epi64: 4170; X86: # %bb.0: # %entry 4171; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4172; X86-NEXT: kmovw %eax, %k1 4173; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z} 4174; X86-NEXT: retl 4175; 4176; X64-LABEL: test_mm_maskz_permutex2var_epi64: 4177; X64: # %bb.0: # %entry 4178; X64-NEXT: kmovw %edi, %k1 4179; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z} 4180; X64-NEXT: retq 4181entry: 4182 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 4183 %1 = bitcast i8 %__U to <8 x i1> 4184 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4185 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer 4186 ret <2 x i64> %2 4187} 4188 4189define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 4190; CHECK-LABEL: test_mm256_permutex2var_epi64: 4191; CHECK: # %bb.0: # %entry 4192; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 4193; CHECK-NEXT: ret{{[l|q]}} 4194entry: 4195 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 4196 ret <4 x i64> %0 4197} 4198 4199define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) { 4200; 
X86-LABEL: test_mm256_mask_permutex2var_epi64: 4201; X86: # %bb.0: # %entry 4202; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4203; X86-NEXT: kmovw %eax, %k1 4204; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} 4205; X86-NEXT: retl 4206; 4207; X64-LABEL: test_mm256_mask_permutex2var_epi64: 4208; X64: # %bb.0: # %entry 4209; X64-NEXT: kmovw %edi, %k1 4210; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} 4211; X64-NEXT: retq 4212entry: 4213 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 4214 %1 = bitcast i8 %__U to <8 x i1> 4215 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4216 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A 4217 ret <4 x i64> %2 4218} 4219 4220define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 4221; X86-LABEL: test_mm256_maskz_permutex2var_epi64: 4222; X86: # %bb.0: # %entry 4223; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4224; X86-NEXT: kmovw %eax, %k1 4225; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z} 4226; X86-NEXT: retl 4227; 4228; X64-LABEL: test_mm256_maskz_permutex2var_epi64: 4229; X64: # %bb.0: # %entry 4230; X64-NEXT: kmovw %edi, %k1 4231; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z} 4232; X64-NEXT: retq 4233entry: 4234 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 4235 %1 = bitcast i8 %__U to <8 x i1> 4236 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4237 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer 4238 ret <4 x i64> %2 4239} 4240 4241 4242define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 4243; X86-LABEL: test_mm_mask_fmadd_pd: 4244; X86: # %bb.0: # %entry 4245; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4246; X86-NEXT: kmovw %eax, %k1 4247; X86-NEXT: 
vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 4248; X86-NEXT: retl 4249; 4250; X64-LABEL: test_mm_mask_fmadd_pd: 4251; X64: # %bb.0: # %entry 4252; X64-NEXT: kmovw %edi, %k1 4253; X64-NEXT: vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 4254; X64-NEXT: retq 4255entry: 4256 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4257 %1 = bitcast i8 %__U to <8 x i1> 4258 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4259 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 4260 ret <2 x double> %2 4261} 4262 4263define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 4264; X86-LABEL: test_mm_mask_fmsub_pd: 4265; X86: # %bb.0: # %entry 4266; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4267; X86-NEXT: kmovw %eax, %k1 4268; X86-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 4269; X86-NEXT: retl 4270; 4271; X64-LABEL: test_mm_mask_fmsub_pd: 4272; X64: # %bb.0: # %entry 4273; X64-NEXT: kmovw %edi, %k1 4274; X64-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 4275; X64-NEXT: retq 4276entry: 4277 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4278 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 4279 %1 = bitcast i8 %__U to <8 x i1> 4280 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4281 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 4282 ret <2 x double> %2 4283} 4284 4285define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 4286; X86-LABEL: test_mm_mask3_fmadd_pd: 4287; X86: # %bb.0: # %entry 4288; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4289; X86-NEXT: kmovw %eax, %k1 4290; X86-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 4291; X86-NEXT: vmovapd %xmm2, %xmm0 4292; X86-NEXT: retl 4293; 
4294; X64-LABEL: test_mm_mask3_fmadd_pd: 4295; X64: # %bb.0: # %entry 4296; X64-NEXT: kmovw %edi, %k1 4297; X64-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 4298; X64-NEXT: vmovapd %xmm2, %xmm0 4299; X64-NEXT: retq 4300entry: 4301 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4302 %1 = bitcast i8 %__U to <8 x i1> 4303 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4304 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 4305 ret <2 x double> %2 4306} 4307 4308define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 4309; X86-LABEL: test_mm_mask3_fnmadd_pd: 4310; X86: # %bb.0: # %entry 4311; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4312; X86-NEXT: kmovw %eax, %k1 4313; X86-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 4314; X86-NEXT: vmovapd %xmm2, %xmm0 4315; X86-NEXT: retl 4316; 4317; X64-LABEL: test_mm_mask3_fnmadd_pd: 4318; X64: # %bb.0: # %entry 4319; X64-NEXT: kmovw %edi, %k1 4320; X64-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 4321; X64-NEXT: vmovapd %xmm2, %xmm0 4322; X64-NEXT: retq 4323entry: 4324 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 4325 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9 4326 %1 = bitcast i8 %__U to <8 x i1> 4327 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4328 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 4329 ret <2 x double> %2 4330} 4331 4332define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4333; X86-LABEL: test_mm_maskz_fmadd_pd: 4334; X86: # %bb.0: # %entry 4335; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4336; X86-NEXT: kmovw %eax, %k1 4337; X86-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4338; X86-NEXT: 
retl 4339; 4340; X64-LABEL: test_mm_maskz_fmadd_pd: 4341; X64: # %bb.0: # %entry 4342; X64-NEXT: kmovw %edi, %k1 4343; X64-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4344; X64-NEXT: retq 4345entry: 4346 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4347 %1 = bitcast i8 %__U to <8 x i1> 4348 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4349 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4350 ret <2 x double> %2 4351} 4352 4353define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4354; X86-LABEL: test_mm_maskz_fmsub_pd: 4355; X86: # %bb.0: # %entry 4356; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4357; X86-NEXT: kmovw %eax, %k1 4358; X86-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 4359; X86-NEXT: retl 4360; 4361; X64-LABEL: test_mm_maskz_fmsub_pd: 4362; X64: # %bb.0: # %entry 4363; X64-NEXT: kmovw %edi, %k1 4364; X64-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 4365; X64-NEXT: retq 4366entry: 4367 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4368 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 4369 %1 = bitcast i8 %__U to <8 x i1> 4370 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4371 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4372 ret <2 x double> %2 4373} 4374 4375define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4376; X86-LABEL: test_mm_maskz_fnmadd_pd: 4377; X86: # %bb.0: # %entry 4378; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4379; X86-NEXT: kmovw %eax, %k1 4380; X86-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 4381; X86-NEXT: retl 4382; 4383; X64-LABEL: test_mm_maskz_fnmadd_pd: 4384; X64: # %bb.0: # %entry 
4385; X64-NEXT: kmovw %edi, %k1 4386; X64-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 4387; X64-NEXT: retq 4388entry: 4389 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 4390 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9 4391 %1 = bitcast i8 %__U to <8 x i1> 4392 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4393 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4394 ret <2 x double> %2 4395} 4396 4397define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4398; X86-LABEL: test_mm_maskz_fnmsub_pd: 4399; X86: # %bb.0: # %entry 4400; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4401; X86-NEXT: kmovw %eax, %k1 4402; X86-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 4403; X86-NEXT: retl 4404; 4405; X64-LABEL: test_mm_maskz_fnmsub_pd: 4406; X64: # %bb.0: # %entry 4407; X64-NEXT: kmovw %edi, %k1 4408; X64-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 4409; X64-NEXT: retq 4410entry: 4411 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 4412 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4413 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9 4414 %1 = bitcast i8 %__U to <8 x i1> 4415 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4416 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4417 ret <2 x double> %2 4418} 4419 4420define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 4421; X86-LABEL: test_mm256_mask_fmadd_pd: 4422; X86: # %bb.0: # %entry 4423; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4424; X86-NEXT: kmovw %eax, %k1 4425; X86-NEXT: vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 
4426; X86-NEXT: retl 4427; 4428; X64-LABEL: test_mm256_mask_fmadd_pd: 4429; X64: # %bb.0: # %entry 4430; X64-NEXT: kmovw %edi, %k1 4431; X64-NEXT: vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 4432; X64-NEXT: retq 4433entry: 4434 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4435 %1 = bitcast i8 %__U to <8 x i1> 4436 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4437 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4438 ret <4 x double> %2 4439} 4440 4441define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 4442; X86-LABEL: test_mm256_mask_fmsub_pd: 4443; X86: # %bb.0: # %entry 4444; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4445; X86-NEXT: kmovw %eax, %k1 4446; X86-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 4447; X86-NEXT: retl 4448; 4449; X64-LABEL: test_mm256_mask_fmsub_pd: 4450; X64: # %bb.0: # %entry 4451; X64-NEXT: kmovw %edi, %k1 4452; X64-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 4453; X64-NEXT: retq 4454entry: 4455 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4456 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 4457 %1 = bitcast i8 %__U to <8 x i1> 4458 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4459 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4460 ret <4 x double> %2 4461} 4462 4463define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 4464; X86-LABEL: test_mm256_mask3_fmadd_pd: 4465; X86: # %bb.0: # %entry 4466; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4467; X86-NEXT: kmovw %eax, %k1 4468; X86-NEXT: vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 4469; X86-NEXT: vmovapd 
%ymm2, %ymm0 4470; X86-NEXT: retl 4471; 4472; X64-LABEL: test_mm256_mask3_fmadd_pd: 4473; X64: # %bb.0: # %entry 4474; X64-NEXT: kmovw %edi, %k1 4475; X64-NEXT: vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 4476; X64-NEXT: vmovapd %ymm2, %ymm0 4477; X64-NEXT: retq 4478entry: 4479 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4480 %1 = bitcast i8 %__U to <8 x i1> 4481 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4482 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 4483 ret <4 x double> %2 4484} 4485 4486define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 4487; X86-LABEL: test_mm256_mask3_fnmadd_pd: 4488; X86: # %bb.0: # %entry 4489; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4490; X86-NEXT: kmovw %eax, %k1 4491; X86-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 4492; X86-NEXT: vmovapd %ymm2, %ymm0 4493; X86-NEXT: retl 4494; 4495; X64-LABEL: test_mm256_mask3_fnmadd_pd: 4496; X64: # %bb.0: # %entry 4497; X64-NEXT: kmovw %edi, %k1 4498; X64-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 4499; X64-NEXT: vmovapd %ymm2, %ymm0 4500; X64-NEXT: retq 4501entry: 4502 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4503 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9 4504 %1 = bitcast i8 %__U to <8 x i1> 4505 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4506 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 4507 ret <4 x double> %2 4508} 4509 4510define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4511; X86-LABEL: test_mm256_maskz_fmadd_pd: 4512; X86: # %bb.0: # %entry 4513; X86-NEXT: movb 
{{[0-9]+}}(%esp), %al 4514; X86-NEXT: kmovw %eax, %k1 4515; X86-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 4516; X86-NEXT: retl 4517; 4518; X64-LABEL: test_mm256_maskz_fmadd_pd: 4519; X64: # %bb.0: # %entry 4520; X64-NEXT: kmovw %edi, %k1 4521; X64-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 4522; X64-NEXT: retq 4523entry: 4524 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4525 %1 = bitcast i8 %__U to <8 x i1> 4526 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4527 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4528 ret <4 x double> %2 4529} 4530 4531define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4532; X86-LABEL: test_mm256_maskz_fmsub_pd: 4533; X86: # %bb.0: # %entry 4534; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4535; X86-NEXT: kmovw %eax, %k1 4536; X86-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 4537; X86-NEXT: retl 4538; 4539; X64-LABEL: test_mm256_maskz_fmsub_pd: 4540; X64: # %bb.0: # %entry 4541; X64-NEXT: kmovw %edi, %k1 4542; X64-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 4543; X64-NEXT: retq 4544entry: 4545 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4546 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 4547 %1 = bitcast i8 %__U to <8 x i1> 4548 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4549 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4550 ret <4 x double> %2 4551} 4552 4553define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4554; X86-LABEL: test_mm256_maskz_fnmadd_pd: 4555; X86: # %bb.0: # %entry 4556; X86-NEXT: 
movb {{[0-9]+}}(%esp), %al 4557; X86-NEXT: kmovw %eax, %k1 4558; X86-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 4559; X86-NEXT: retl 4560; 4561; X64-LABEL: test_mm256_maskz_fnmadd_pd: 4562; X64: # %bb.0: # %entry 4563; X64-NEXT: kmovw %edi, %k1 4564; X64-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 4565; X64-NEXT: retq 4566entry: 4567 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4568 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9 4569 %1 = bitcast i8 %__U to <8 x i1> 4570 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4571 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4572 ret <4 x double> %2 4573} 4574 4575define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4576; X86-LABEL: test_mm256_maskz_fnmsub_pd: 4577; X86: # %bb.0: # %entry 4578; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4579; X86-NEXT: kmovw %eax, %k1 4580; X86-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 4581; X86-NEXT: retl 4582; 4583; X64-LABEL: test_mm256_maskz_fnmsub_pd: 4584; X64: # %bb.0: # %entry 4585; X64-NEXT: kmovw %edi, %k1 4586; X64-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 4587; X64-NEXT: retq 4588entry: 4589 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4590 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4591 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9 4592 %1 = bitcast i8 %__U to <8 x i1> 4593 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4594 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x 
double> zeroinitializer 4595 ret <4 x double> %2 4596} 4597 4598define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 4599; X86-LABEL: test_mm_mask_fmadd_ps: 4600; X86: # %bb.0: # %entry 4601; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4602; X86-NEXT: kmovw %eax, %k1 4603; X86-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 4604; X86-NEXT: retl 4605; 4606; X64-LABEL: test_mm_mask_fmadd_ps: 4607; X64: # %bb.0: # %entry 4608; X64-NEXT: kmovw %edi, %k1 4609; X64-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 4610; X64-NEXT: retq 4611entry: 4612 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 4613 %1 = bitcast i8 %__U to <8 x i1> 4614 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4615 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 4616 ret <4 x float> %2 4617} 4618 4619define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 4620; X86-LABEL: test_mm_mask_fmsub_ps: 4621; X86: # %bb.0: # %entry 4622; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4623; X86-NEXT: kmovw %eax, %k1 4624; X86-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 4625; X86-NEXT: retl 4626; 4627; X64-LABEL: test_mm_mask_fmsub_ps: 4628; X64: # %bb.0: # %entry 4629; X64-NEXT: kmovw %edi, %k1 4630; X64-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 4631; X64-NEXT: retq 4632entry: 4633 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4634 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 4635 %1 = bitcast i8 %__U to <8 x i1> 4636 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4637 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 4638 ret <4 x float> %2 4639} 4640 4641define 
<4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 4642; X86-LABEL: test_mm_mask3_fmadd_ps: 4643; X86: # %bb.0: # %entry 4644; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4645; X86-NEXT: kmovw %eax, %k1 4646; X86-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 4647; X86-NEXT: vmovaps %xmm2, %xmm0 4648; X86-NEXT: retl 4649; 4650; X64-LABEL: test_mm_mask3_fmadd_ps: 4651; X64: # %bb.0: # %entry 4652; X64-NEXT: kmovw %edi, %k1 4653; X64-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 4654; X64-NEXT: vmovaps %xmm2, %xmm0 4655; X64-NEXT: retq 4656entry: 4657 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 4658 %1 = bitcast i8 %__U to <8 x i1> 4659 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4660 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 4661 ret <4 x float> %2 4662} 4663 4664define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 4665; X86-LABEL: test_mm_mask3_fnmadd_ps: 4666; X86: # %bb.0: # %entry 4667; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4668; X86-NEXT: kmovw %eax, %k1 4669; X86-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 4670; X86-NEXT: vmovaps %xmm2, %xmm0 4671; X86-NEXT: retl 4672; 4673; X64-LABEL: test_mm_mask3_fnmadd_ps: 4674; X64: # %bb.0: # %entry 4675; X64-NEXT: kmovw %edi, %k1 4676; X64-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 4677; X64-NEXT: vmovaps %xmm2, %xmm0 4678; X64-NEXT: retq 4679entry: 4680 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4681 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9 4682 %1 = bitcast i8 %__U to <8 x i1> 4683 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4684 %2 = select <4 x i1> 
%extract.i, <4 x float> %0, <4 x float> %__C 4685 ret <4 x float> %2 4686} 4687 4688define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4689; X86-LABEL: test_mm_maskz_fmadd_ps: 4690; X86: # %bb.0: # %entry 4691; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4692; X86-NEXT: kmovw %eax, %k1 4693; X86-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4694; X86-NEXT: retl 4695; 4696; X64-LABEL: test_mm_maskz_fmadd_ps: 4697; X64: # %bb.0: # %entry 4698; X64-NEXT: kmovw %edi, %k1 4699; X64-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4700; X64-NEXT: retq 4701entry: 4702 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 4703 %1 = bitcast i8 %__U to <8 x i1> 4704 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4705 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 4706 ret <4 x float> %2 4707} 4708 4709define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4710; X86-LABEL: test_mm_maskz_fmsub_ps: 4711; X86: # %bb.0: # %entry 4712; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4713; X86-NEXT: kmovw %eax, %k1 4714; X86-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 4715; X86-NEXT: retl 4716; 4717; X64-LABEL: test_mm_maskz_fmsub_ps: 4718; X64: # %bb.0: # %entry 4719; X64-NEXT: kmovw %edi, %k1 4720; X64-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 4721; X64-NEXT: retq 4722entry: 4723 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4724 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 4725 %1 = bitcast i8 %__U to <8 x i1> 4726 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4727 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> 
zeroinitializer 4728 ret <4 x float> %2 4729} 4730 4731define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4732; X86-LABEL: test_mm_maskz_fnmadd_ps: 4733; X86: # %bb.0: # %entry 4734; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4735; X86-NEXT: kmovw %eax, %k1 4736; X86-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 4737; X86-NEXT: retl 4738; 4739; X64-LABEL: test_mm_maskz_fnmadd_ps: 4740; X64: # %bb.0: # %entry 4741; X64-NEXT: kmovw %edi, %k1 4742; X64-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 4743; X64-NEXT: retq 4744entry: 4745 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4746 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9 4747 %1 = bitcast i8 %__U to <8 x i1> 4748 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4749 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 4750 ret <4 x float> %2 4751} 4752 4753define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4754; X86-LABEL: test_mm_maskz_fnmsub_ps: 4755; X86: # %bb.0: # %entry 4756; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4757; X86-NEXT: kmovw %eax, %k1 4758; X86-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 4759; X86-NEXT: retl 4760; 4761; X64-LABEL: test_mm_maskz_fnmsub_ps: 4762; X64: # %bb.0: # %entry 4763; X64-NEXT: kmovw %edi, %k1 4764; X64-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 4765; X64-NEXT: retq 4766entry: 4767 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4768 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4769 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x 
float> %sub1.i) #9 4770 %1 = bitcast i8 %__U to <8 x i1> 4771 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4772 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 4773 ret <4 x float> %2 4774} 4775 4776define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 4777; X86-LABEL: test_mm256_mask_fmadd_ps: 4778; X86: # %bb.0: # %entry 4779; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4780; X86-NEXT: kmovw %eax, %k1 4781; X86-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 4782; X86-NEXT: retl 4783; 4784; X64-LABEL: test_mm256_mask_fmadd_ps: 4785; X64: # %bb.0: # %entry 4786; X64-NEXT: kmovw %edi, %k1 4787; X64-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 4788; X64-NEXT: retq 4789entry: 4790 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 4791 %1 = bitcast i8 %__U to <8 x i1> 4792 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 4793 ret <8 x float> %2 4794} 4795 4796define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 4797; X86-LABEL: test_mm256_mask_fmsub_ps: 4798; X86: # %bb.0: # %entry 4799; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4800; X86-NEXT: kmovw %eax, %k1 4801; X86-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 4802; X86-NEXT: retl 4803; 4804; X64-LABEL: test_mm256_mask_fmsub_ps: 4805; X64: # %bb.0: # %entry 4806; X64-NEXT: kmovw %edi, %k1 4807; X64-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 4808; X64-NEXT: retq 4809entry: 4810 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4811 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 4812 %1 = bitcast i8 %__U to <8 x i1> 4813 %2 = 
select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 4814 ret <8 x float> %2 4815} 4816 4817define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 4818; X86-LABEL: test_mm256_mask3_fmadd_ps: 4819; X86: # %bb.0: # %entry 4820; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4821; X86-NEXT: kmovw %eax, %k1 4822; X86-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 4823; X86-NEXT: vmovaps %ymm2, %ymm0 4824; X86-NEXT: retl 4825; 4826; X64-LABEL: test_mm256_mask3_fmadd_ps: 4827; X64: # %bb.0: # %entry 4828; X64-NEXT: kmovw %edi, %k1 4829; X64-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 4830; X64-NEXT: vmovaps %ymm2, %ymm0 4831; X64-NEXT: retq 4832entry: 4833 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 4834 %1 = bitcast i8 %__U to <8 x i1> 4835 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 4836 ret <8 x float> %2 4837} 4838 4839define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 4840; X86-LABEL: test_mm256_mask3_fnmadd_ps: 4841; X86: # %bb.0: # %entry 4842; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4843; X86-NEXT: kmovw %eax, %k1 4844; X86-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 4845; X86-NEXT: vmovaps %ymm2, %ymm0 4846; X86-NEXT: retl 4847; 4848; X64-LABEL: test_mm256_mask3_fnmadd_ps: 4849; X64: # %bb.0: # %entry 4850; X64-NEXT: kmovw %edi, %k1 4851; X64-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 4852; X64-NEXT: vmovaps %ymm2, %ymm0 4853; X64-NEXT: retq 4854entry: 4855 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4856 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9 4857 %1 = bitcast i8 %__U to <8 x i1> 4858 %2 = select <8 x i1> %1, <8 x 
float> %0, <8 x float> %__C 4859 ret <8 x float> %2 4860} 4861 4862define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 4863; X86-LABEL: test_mm256_maskz_fmadd_ps: 4864; X86: # %bb.0: # %entry 4865; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4866; X86-NEXT: kmovw %eax, %k1 4867; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 4868; X86-NEXT: retl 4869; 4870; X64-LABEL: test_mm256_maskz_fmadd_ps: 4871; X64: # %bb.0: # %entry 4872; X64-NEXT: kmovw %edi, %k1 4873; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 4874; X64-NEXT: retq 4875entry: 4876 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 4877 %1 = bitcast i8 %__U to <8 x i1> 4878 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 4879 ret <8 x float> %2 4880} 4881 4882define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 4883; X86-LABEL: test_mm256_maskz_fmsub_ps: 4884; X86: # %bb.0: # %entry 4885; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4886; X86-NEXT: kmovw %eax, %k1 4887; X86-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 4888; X86-NEXT: retl 4889; 4890; X64-LABEL: test_mm256_maskz_fmsub_ps: 4891; X64: # %bb.0: # %entry 4892; X64-NEXT: kmovw %edi, %k1 4893; X64-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 4894; X64-NEXT: retq 4895entry: 4896 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4897 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 4898 %1 = bitcast i8 %__U to <8 x i1> 4899 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 4900 ret <8 x float> %2 4901} 4902 4903define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x 
float> %__B, <8 x float> %__C) { 4904; X86-LABEL: test_mm256_maskz_fnmadd_ps: 4905; X86: # %bb.0: # %entry 4906; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4907; X86-NEXT: kmovw %eax, %k1 4908; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 4909; X86-NEXT: retl 4910; 4911; X64-LABEL: test_mm256_maskz_fnmadd_ps: 4912; X64: # %bb.0: # %entry 4913; X64-NEXT: kmovw %edi, %k1 4914; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 4915; X64-NEXT: retq 4916entry: 4917 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4918 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9 4919 %1 = bitcast i8 %__U to <8 x i1> 4920 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 4921 ret <8 x float> %2 4922} 4923 4924define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 4925; X86-LABEL: test_mm256_maskz_fnmsub_ps: 4926; X86: # %bb.0: # %entry 4927; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4928; X86-NEXT: kmovw %eax, %k1 4929; X86-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 4930; X86-NEXT: retl 4931; 4932; X64-LABEL: test_mm256_maskz_fnmsub_ps: 4933; X64: # %bb.0: # %entry 4934; X64-NEXT: kmovw %edi, %k1 4935; X64-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 4936; X64-NEXT: retq 4937entry: 4938 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4939 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4940 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x 
float> %__B, <8 x float> %sub1.i) #9 4941 %1 = bitcast i8 %__U to <8 x i1> 4942 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 4943 ret <8 x float> %2 4944} 4945 4946define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 4947; X86-LABEL: test_mm_mask_fmaddsub_pd: 4948; X86: # %bb.0: # %entry 4949; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4950; X86-NEXT: kmovw %eax, %k1 4951; X86-NEXT: vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 4952; X86-NEXT: retl 4953; 4954; X64-LABEL: test_mm_mask_fmaddsub_pd: 4955; X64: # %bb.0: # %entry 4956; X64-NEXT: kmovw %edi, %k1 4957; X64-NEXT: vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 4958; X64-NEXT: retq 4959entry: 4960 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4961 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4962 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 4963 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 4964 %4 = bitcast i8 %__U to <8 x i1> 4965 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4966 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A 4967 ret <2 x double> %5 4968} 4969 4970define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 4971; X86-LABEL: test_mm_mask_fmsubadd_pd: 4972; X86: # %bb.0: # %entry 4973; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4974; X86-NEXT: kmovw %eax, %k1 4975; X86-NEXT: vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 4976; X86-NEXT: retl 4977; 4978; X64-LABEL: test_mm_mask_fmsubadd_pd: 4979; X64: # %bb.0: # %entry 4980; X64-NEXT: kmovw %edi, %k1 4981; X64-NEXT: vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 4982; X64-NEXT: retq 4983entry: 4984 %sub.i = fsub <2 x double> <double 
-0.000000e+00, double -0.000000e+00>, %__C 4985 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 4986 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4987 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> 4988 %3 = bitcast i8 %__U to <8 x i1> 4989 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4990 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A 4991 ret <2 x double> %4 4992} 4993 4994define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 4995; X86-LABEL: test_mm_mask3_fmaddsub_pd: 4996; X86: # %bb.0: # %entry 4997; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4998; X86-NEXT: kmovw %eax, %k1 4999; X86-NEXT: vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2 5000; X86-NEXT: vmovapd %xmm2, %xmm0 5001; X86-NEXT: retl 5002; 5003; X64-LABEL: test_mm_mask3_fmaddsub_pd: 5004; X64: # %bb.0: # %entry 5005; X64-NEXT: kmovw %edi, %k1 5006; X64-NEXT: vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2 5007; X64-NEXT: vmovapd %xmm2, %xmm0 5008; X64-NEXT: retq 5009entry: 5010 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5011 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5012 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 5013 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5014 %4 = bitcast i8 %__U to <8 x i1> 5015 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5016 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C 5017 ret <2 x double> %5 5018} 5019 5020define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5021; X86-LABEL: 
test_mm_maskz_fmaddsub_pd: 5022; X86: # %bb.0: # %entry 5023; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5024; X86-NEXT: kmovw %eax, %k1 5025; X86-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 5026; X86-NEXT: retl 5027; 5028; X64-LABEL: test_mm_maskz_fmaddsub_pd: 5029; X64: # %bb.0: # %entry 5030; X64-NEXT: kmovw %edi, %k1 5031; X64-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 5032; X64-NEXT: retq 5033entry: 5034 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5035 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5036 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 5037 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5038 %4 = bitcast i8 %__U to <8 x i1> 5039 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5040 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer 5041 ret <2 x double> %5 5042} 5043 5044define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5045; X86-LABEL: test_mm_maskz_fmsubadd_pd: 5046; X86: # %bb.0: # %entry 5047; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5048; X86-NEXT: kmovw %eax, %k1 5049; X86-NEXT: vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 5050; X86-NEXT: retl 5051; 5052; X64-LABEL: test_mm_maskz_fmsubadd_pd: 5053; X64: # %bb.0: # %entry 5054; X64-NEXT: kmovw %edi, %k1 5055; X64-NEXT: vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 5056; X64-NEXT: retq 5057entry: 5058 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5059 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 5060 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5061 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 
x i32> <i32 0, i32 3> 5062 %3 = bitcast i8 %__U to <8 x i1> 5063 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5064 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer 5065 ret <2 x double> %4 5066} 5067 5068define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5069; X86-LABEL: test_mm256_mask_fmaddsub_pd: 5070; X86: # %bb.0: # %entry 5071; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5072; X86-NEXT: kmovw %eax, %k1 5073; X86-NEXT: vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 5074; X86-NEXT: retl 5075; 5076; X64-LABEL: test_mm256_mask_fmaddsub_pd: 5077; X64: # %bb.0: # %entry 5078; X64-NEXT: kmovw %edi, %k1 5079; X64-NEXT: vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 5080; X64-NEXT: retq 5081entry: 5082 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5083 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5084 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 5085 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5086 %4 = bitcast i8 %__U to <8 x i1> 5087 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5088 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A 5089 ret <4 x double> %5 5090} 5091 5092define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5093; X86-LABEL: test_mm256_mask_fmsubadd_pd: 5094; X86: # %bb.0: # %entry 5095; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5096; X86-NEXT: kmovw %eax, %k1 5097; X86-NEXT: vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 5098; X86-NEXT: retl 5099; 5100; X64-LABEL: test_mm256_mask_fmsubadd_pd: 5101; X64: # %bb.0: # %entry 5102; 
X64-NEXT: kmovw %edi, %k1 5103; X64-NEXT: vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 5104; X64-NEXT: retq 5105entry: 5106 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5107 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 5108 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5109 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5110 %3 = bitcast i8 %__U to <8 x i1> 5111 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5112 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A 5113 ret <4 x double> %4 5114} 5115 5116define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5117; X86-LABEL: test_mm256_mask3_fmaddsub_pd: 5118; X86: # %bb.0: # %entry 5119; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5120; X86-NEXT: kmovw %eax, %k1 5121; X86-NEXT: vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2 5122; X86-NEXT: vmovapd %ymm2, %ymm0 5123; X86-NEXT: retl 5124; 5125; X64-LABEL: test_mm256_mask3_fmaddsub_pd: 5126; X64: # %bb.0: # %entry 5127; X64-NEXT: kmovw %edi, %k1 5128; X64-NEXT: vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2 5129; X64-NEXT: vmovapd %ymm2, %ymm0 5130; X64-NEXT: retq 5131entry: 5132 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5133 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5134 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 5135 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5136 %4 = bitcast i8 %__U to <8 x i1> 5137 %extract.i = shufflevector <8 x i1> %4, 
<8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5138 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C 5139 ret <4 x double> %5 5140} 5141 5142define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 5143; X86-LABEL: test_mm256_maskz_fmaddsub_pd: 5144; X86: # %bb.0: # %entry 5145; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5146; X86-NEXT: kmovw %eax, %k1 5147; X86-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 5148; X86-NEXT: retl 5149; 5150; X64-LABEL: test_mm256_maskz_fmaddsub_pd: 5151; X64: # %bb.0: # %entry 5152; X64-NEXT: kmovw %edi, %k1 5153; X64-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 5154; X64-NEXT: retq 5155entry: 5156 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5157 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5158 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 5159 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5160 %4 = bitcast i8 %__U to <8 x i1> 5161 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5162 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer 5163 ret <4 x double> %5 5164} 5165 5166define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 5167; X86-LABEL: test_mm256_maskz_fmsubadd_pd: 5168; X86: # %bb.0: # %entry 5169; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5170; X86-NEXT: kmovw %eax, %k1 5171; X86-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 5172; X86-NEXT: retl 5173; 5174; X64-LABEL: test_mm256_maskz_fmsubadd_pd: 5175; X64: # %bb.0: # %entry 5176; X64-NEXT: kmovw %edi, %k1 5177; X64-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ 
ymm2 5178; X64-NEXT: retq 5179entry: 5180 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5181 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 5182 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5183 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5184 %3 = bitcast i8 %__U to <8 x i1> 5185 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5186 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer 5187 ret <4 x double> %4 5188} 5189 5190define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5191; X86-LABEL: test_mm_mask_fmaddsub_ps: 5192; X86: # %bb.0: # %entry 5193; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5194; X86-NEXT: kmovw %eax, %k1 5195; X86-NEXT: vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 5196; X86-NEXT: retl 5197; 5198; X64-LABEL: test_mm_mask_fmaddsub_ps: 5199; X64: # %bb.0: # %entry 5200; X64-NEXT: kmovw %edi, %k1 5201; X64-NEXT: vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2 5202; X64-NEXT: retq 5203entry: 5204 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5205 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5206 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 5207 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5208 %4 = bitcast i8 %__U to <8 x i1> 5209 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5210 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A 5211 ret <4 x float> %5 5212} 5213 5214define <4 x float> 
@test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5215; X86-LABEL: test_mm_mask_fmsubadd_ps: 5216; X86: # %bb.0: # %entry 5217; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5218; X86-NEXT: kmovw %eax, %k1 5219; X86-NEXT: vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 5220; X86-NEXT: retl 5221; 5222; X64-LABEL: test_mm_mask_fmsubadd_ps: 5223; X64: # %bb.0: # %entry 5224; X64-NEXT: kmovw %edi, %k1 5225; X64-NEXT: vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2 5226; X64-NEXT: retq 5227entry: 5228 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5229 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5230 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5231 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5232 %3 = bitcast i8 %__U to <8 x i1> 5233 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5234 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A 5235 ret <4 x float> %4 5236} 5237 5238define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5239; X86-LABEL: test_mm_mask3_fmaddsub_ps: 5240; X86: # %bb.0: # %entry 5241; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5242; X86-NEXT: kmovw %eax, %k1 5243; X86-NEXT: vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2 5244; X86-NEXT: vmovaps %xmm2, %xmm0 5245; X86-NEXT: retl 5246; 5247; X64-LABEL: test_mm_mask3_fmaddsub_ps: 5248; X64: # %bb.0: # %entry 5249; X64-NEXT: kmovw %edi, %k1 5250; X64-NEXT: vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2 5251; X64-NEXT: vmovaps %xmm2, %xmm0 5252; X64-NEXT: retq 5253entry: 5254 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5255 %1 = fsub <4 x 
float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5256 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 5257 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5258 %4 = bitcast i8 %__U to <8 x i1> 5259 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5260 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C 5261 ret <4 x float> %5 5262} 5263 5264define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5265; X86-LABEL: test_mm_maskz_fmaddsub_ps: 5266; X86: # %bb.0: # %entry 5267; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5268; X86-NEXT: kmovw %eax, %k1 5269; X86-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 5270; X86-NEXT: retl 5271; 5272; X64-LABEL: test_mm_maskz_fmaddsub_ps: 5273; X64: # %bb.0: # %entry 5274; X64-NEXT: kmovw %edi, %k1 5275; X64-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 5276; X64-NEXT: retq 5277entry: 5278 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5279 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5280 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 5281 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5282 %4 = bitcast i8 %__U to <8 x i1> 5283 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5284 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer 5285 ret <4 x float> %5 5286} 5287 5288define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5289; X86-LABEL: test_mm_maskz_fmsubadd_ps: 5290; X86: # %bb.0: # %entry 5291; 
X86-NEXT: movb {{[0-9]+}}(%esp), %al 5292; X86-NEXT: kmovw %eax, %k1 5293; X86-NEXT: vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 5294; X86-NEXT: retl 5295; 5296; X64-LABEL: test_mm_maskz_fmsubadd_ps: 5297; X64: # %bb.0: # %entry 5298; X64-NEXT: kmovw %edi, %k1 5299; X64-NEXT: vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 5300; X64-NEXT: retq 5301entry: 5302 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5303 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5304 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5305 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5306 %3 = bitcast i8 %__U to <8 x i1> 5307 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5308 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer 5309 ret <4 x float> %4 5310} 5311 5312define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5313; X86-LABEL: test_mm256_mask_fmaddsub_ps: 5314; X86: # %bb.0: # %entry 5315; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5316; X86-NEXT: kmovw %eax, %k1 5317; X86-NEXT: vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 5318; X86-NEXT: retl 5319; 5320; X64-LABEL: test_mm256_mask_fmaddsub_ps: 5321; X64: # %bb.0: # %entry 5322; X64-NEXT: kmovw %edi, %k1 5323; X64-NEXT: vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2 5324; X64-NEXT: retq 5325entry: 5326 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5327 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5328 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x 
float> %__A, <8 x float> %__B, <8 x float> %1) #9 5329 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5330 %4 = bitcast i8 %__U to <8 x i1> 5331 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A 5332 ret <8 x float> %5 5333} 5334 5335define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5336; X86-LABEL: test_mm256_mask_fmsubadd_ps: 5337; X86: # %bb.0: # %entry 5338; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5339; X86-NEXT: kmovw %eax, %k1 5340; X86-NEXT: vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 5341; X86-NEXT: retl 5342; 5343; X64-LABEL: test_mm256_mask_fmsubadd_ps: 5344; X64: # %bb.0: # %entry 5345; X64-NEXT: kmovw %edi, %k1 5346; X64-NEXT: vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2 5347; X64-NEXT: retq 5348entry: 5349 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5350 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5351 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5352 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5353 %3 = bitcast i8 %__U to <8 x i1> 5354 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A 5355 ret <8 x float> %4 5356} 5357 5358define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5359; X86-LABEL: test_mm256_mask3_fmaddsub_ps: 5360; X86: # %bb.0: # %entry 5361; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5362; X86-NEXT: kmovw %eax, %k1 5363; X86-NEXT: vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2 5364; X86-NEXT: vmovaps %ymm2, %ymm0 5365; X86-NEXT: retl 5366; 5367; 
X64-LABEL: test_mm256_mask3_fmaddsub_ps: 5368; X64: # %bb.0: # %entry 5369; X64-NEXT: kmovw %edi, %k1 5370; X64-NEXT: vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2 5371; X64-NEXT: vmovaps %ymm2, %ymm0 5372; X64-NEXT: retq 5373entry: 5374 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5375 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5376 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9 5377 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5378 %4 = bitcast i8 %__U to <8 x i1> 5379 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C 5380 ret <8 x float> %5 5381} 5382 5383define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 5384; X86-LABEL: test_mm256_maskz_fmaddsub_ps: 5385; X86: # %bb.0: # %entry 5386; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5387; X86-NEXT: kmovw %eax, %k1 5388; X86-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 5389; X86-NEXT: retl 5390; 5391; X64-LABEL: test_mm256_maskz_fmaddsub_ps: 5392; X64: # %bb.0: # %entry 5393; X64-NEXT: kmovw %edi, %k1 5394; X64-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 5395; X64-NEXT: retq 5396entry: 5397 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5398 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5399 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9 5400 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 
11, i32 4, i32 13, i32 6, i32 15> 5401 %4 = bitcast i8 %__U to <8 x i1> 5402 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer 5403 ret <8 x float> %5 5404} 5405 5406define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 5407; X86-LABEL: test_mm256_maskz_fmsubadd_ps: 5408; X86: # %bb.0: # %entry 5409; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5410; X86-NEXT: kmovw %eax, %k1 5411; X86-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 5412; X86-NEXT: retl 5413; 5414; X64-LABEL: test_mm256_maskz_fmsubadd_ps: 5415; X64: # %bb.0: # %entry 5416; X64-NEXT: kmovw %edi, %k1 5417; X64-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 5418; X64-NEXT: retq 5419entry: 5420 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5421 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5422 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5423 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5424 %3 = bitcast i8 %__U to <8 x i1> 5425 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer 5426 ret <8 x float> %4 5427} 5428 5429define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 5430; X86-LABEL: test_mm_mask3_fmsub_pd: 5431; X86: # %bb.0: # %entry 5432; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5433; X86-NEXT: kmovw %eax, %k1 5434; X86-NEXT: vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5435; X86-NEXT: vmovapd %xmm2, %xmm0 5436; X86-NEXT: retl 5437; 5438; X64-LABEL: test_mm_mask3_fmsub_pd: 5439; X64: # %bb.0: # %entry 5440; X64-NEXT: kmovw %edi, %k1 5441; X64-NEXT: vfmsub231pd {{.*#+}} xmm2 = 
(xmm0 * xmm1) - xmm2 5442; X64-NEXT: vmovapd %xmm2, %xmm0 5443; X64-NEXT: retq 5444entry: 5445 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5446 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 5447 %1 = bitcast i8 %__U to <8 x i1> 5448 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5449 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 5450 ret <2 x double> %2 5451} 5452 5453define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5454; X86-LABEL: test_mm256_mask3_fmsub_pd: 5455; X86: # %bb.0: # %entry 5456; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5457; X86-NEXT: kmovw %eax, %k1 5458; X86-NEXT: vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 5459; X86-NEXT: vmovapd %ymm2, %ymm0 5460; X86-NEXT: retl 5461; 5462; X64-LABEL: test_mm256_mask3_fmsub_pd: 5463; X64: # %bb.0: # %entry 5464; X64-NEXT: kmovw %edi, %k1 5465; X64-NEXT: vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 5466; X64-NEXT: vmovapd %ymm2, %ymm0 5467; X64-NEXT: retq 5468entry: 5469 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5470 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 5471 %1 = bitcast i8 %__U to <8 x i1> 5472 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5473 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 5474 ret <4 x double> %2 5475} 5476 5477define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5478; X86-LABEL: test_mm_mask3_fmsub_ps: 5479; X86: # %bb.0: # %entry 5480; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5481; X86-NEXT: kmovw %eax, %k1 5482; X86-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5483; X86-NEXT: vmovaps 
%xmm2, %xmm0 5484; X86-NEXT: retl 5485; 5486; X64-LABEL: test_mm_mask3_fmsub_ps: 5487; X64: # %bb.0: # %entry 5488; X64-NEXT: kmovw %edi, %k1 5489; X64-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5490; X64-NEXT: vmovaps %xmm2, %xmm0 5491; X64-NEXT: retq 5492entry: 5493 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5494 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5495 %1 = bitcast i8 %__U to <8 x i1> 5496 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5497 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 5498 ret <4 x float> %2 5499} 5500 5501define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5502; X86-LABEL: test_mm256_mask3_fmsub_ps: 5503; X86: # %bb.0: # %entry 5504; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5505; X86-NEXT: kmovw %eax, %k1 5506; X86-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 5507; X86-NEXT: vmovaps %ymm2, %ymm0 5508; X86-NEXT: retl 5509; 5510; X64-LABEL: test_mm256_mask3_fmsub_ps: 5511; X64: # %bb.0: # %entry 5512; X64-NEXT: kmovw %edi, %k1 5513; X64-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 5514; X64-NEXT: vmovaps %ymm2, %ymm0 5515; X64-NEXT: retq 5516entry: 5517 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5518 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5519 %1 = bitcast i8 %__U to <8 x i1> 5520 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 5521 ret <8 x float> %2 5522} 5523 5524define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 5525; X86-LABEL: 
test_mm_mask3_fmsubadd_pd: 5526; X86: # %bb.0: # %entry 5527; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5528; X86-NEXT: kmovw %eax, %k1 5529; X86-NEXT: vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 5530; X86-NEXT: vmovapd %xmm2, %xmm0 5531; X86-NEXT: retl 5532; 5533; X64-LABEL: test_mm_mask3_fmsubadd_pd: 5534; X64: # %bb.0: # %entry 5535; X64-NEXT: kmovw %edi, %k1 5536; X64-NEXT: vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 5537; X64-NEXT: vmovapd %xmm2, %xmm0 5538; X64-NEXT: retq 5539entry: 5540 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5541 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 5542 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5543 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5544 %3 = bitcast i8 %__U to <8 x i1> 5545 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5546 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C 5547 ret <2 x double> %4 5548} 5549 5550define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5551; X86-LABEL: test_mm256_mask3_fmsubadd_pd: 5552; X86: # %bb.0: # %entry 5553; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5554; X86-NEXT: kmovw %eax, %k1 5555; X86-NEXT: vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 5556; X86-NEXT: vmovapd %ymm2, %ymm0 5557; X86-NEXT: retl 5558; 5559; X64-LABEL: test_mm256_mask3_fmsubadd_pd: 5560; X64: # %bb.0: # %entry 5561; X64-NEXT: kmovw %edi, %k1 5562; X64-NEXT: vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 5563; X64-NEXT: vmovapd %ymm2, %ymm0 5564; X64-NEXT: retq 5565entry: 5566 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5567 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> 
%__B, <4 x double> %sub.i) #9 5568 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5569 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5570 %3 = bitcast i8 %__U to <8 x i1> 5571 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5572 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C 5573 ret <4 x double> %4 5574} 5575 5576define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5577; X86-LABEL: test_mm_mask3_fmsubadd_ps: 5578; X86: # %bb.0: # %entry 5579; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5580; X86-NEXT: kmovw %eax, %k1 5581; X86-NEXT: vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 5582; X86-NEXT: vmovaps %xmm2, %xmm0 5583; X86-NEXT: retl 5584; 5585; X64-LABEL: test_mm_mask3_fmsubadd_ps: 5586; X64: # %bb.0: # %entry 5587; X64-NEXT: kmovw %edi, %k1 5588; X64-NEXT: vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2 5589; X64-NEXT: vmovaps %xmm2, %xmm0 5590; X64-NEXT: retq 5591entry: 5592 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5593 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5594 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5595 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5596 %3 = bitcast i8 %__U to <8 x i1> 5597 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5598 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C 5599 ret <4 x float> %4 5600} 5601 5602define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5603; X86-LABEL: test_mm256_mask3_fmsubadd_ps: 5604; X86: # %bb.0: # 
%entry 5605; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5606; X86-NEXT: kmovw %eax, %k1 5607; X86-NEXT: vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 5608; X86-NEXT: vmovaps %ymm2, %ymm0 5609; X86-NEXT: retl 5610; 5611; X64-LABEL: test_mm256_mask3_fmsubadd_ps: 5612; X64: # %bb.0: # %entry 5613; X64-NEXT: kmovw %edi, %k1 5614; X64-NEXT: vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2 5615; X64-NEXT: vmovaps %ymm2, %ymm0 5616; X64-NEXT: retq 5617entry: 5618 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5619 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5620 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5621 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5622 %3 = bitcast i8 %__U to <8 x i1> 5623 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C 5624 ret <8 x float> %4 5625} 5626 5627define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 5628; X86-LABEL: test_mm_mask_fnmadd_pd: 5629; X86: # %bb.0: # %entry 5630; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5631; X86-NEXT: kmovw %eax, %k1 5632; X86-NEXT: vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 5633; X86-NEXT: retl 5634; 5635; X64-LABEL: test_mm_mask_fnmadd_pd: 5636; X64: # %bb.0: # %entry 5637; X64-NEXT: kmovw %edi, %k1 5638; X64-NEXT: vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 5639; X64-NEXT: retq 5640entry: 5641 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B 5642 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9 5643 %1 = bitcast i8 %__U to <8 x i1> 5644 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, 
<2 x i32> <i32 0, i32 1> 5645 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 5646 ret <2 x double> %2 5647} 5648 5649define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5650; X86-LABEL: test_mm256_mask_fnmadd_pd: 5651; X86: # %bb.0: # %entry 5652; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5653; X86-NEXT: kmovw %eax, %k1 5654; X86-NEXT: vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 5655; X86-NEXT: retl 5656; 5657; X64-LABEL: test_mm256_mask_fnmadd_pd: 5658; X64: # %bb.0: # %entry 5659; X64-NEXT: kmovw %edi, %k1 5660; X64-NEXT: vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 5661; X64-NEXT: retq 5662entry: 5663 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5664 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9 5665 %1 = bitcast i8 %__U to <8 x i1> 5666 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5667 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 5668 ret <4 x double> %2 5669} 5670 5671define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5672; X86-LABEL: test_mm_mask_fnmadd_ps: 5673; X86: # %bb.0: # %entry 5674; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5675; X86-NEXT: kmovw %eax, %k1 5676; X86-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 5677; X86-NEXT: retl 5678; 5679; X64-LABEL: test_mm_mask_fnmadd_ps: 5680; X64: # %bb.0: # %entry 5681; X64-NEXT: kmovw %edi, %k1 5682; X64-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2 5683; X64-NEXT: retq 5684entry: 5685 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5686 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9 5687 %1 = bitcast 
i8 %__U to <8 x i1> 5688 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5689 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 5690 ret <4 x float> %2 5691} 5692 5693define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5694; X86-LABEL: test_mm256_mask_fnmadd_ps: 5695; X86: # %bb.0: # %entry 5696; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5697; X86-NEXT: kmovw %eax, %k1 5698; X86-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 5699; X86-NEXT: retl 5700; 5701; X64-LABEL: test_mm256_mask_fnmadd_ps: 5702; X64: # %bb.0: # %entry 5703; X64-NEXT: kmovw %edi, %k1 5704; X64-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2 5705; X64-NEXT: retq 5706entry: 5707 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5708 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9 5709 %1 = bitcast i8 %__U to <8 x i1> 5710 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 5711 ret <8 x float> %2 5712} 5713 5714define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 5715; X86-LABEL: test_mm_mask_fnmsub_pd: 5716; X86: # %bb.0: # %entry 5717; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5718; X86-NEXT: kmovw %eax, %k1 5719; X86-NEXT: vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 5720; X86-NEXT: retl 5721; 5722; X64-LABEL: test_mm_mask_fnmsub_pd: 5723; X64: # %bb.0: # %entry 5724; X64-NEXT: kmovw %edi, %k1 5725; X64-NEXT: vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 5726; X64-NEXT: retq 5727entry: 5728 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B 5729 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5730 %0 = tail call <2 x 
double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 5731 %1 = bitcast i8 %__U to <8 x i1> 5732 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5733 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 5734 ret <2 x double> %2 5735} 5736 5737define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 5738; X86-LABEL: test_mm_mask3_fnmsub_pd: 5739; X86: # %bb.0: # %entry 5740; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5741; X86-NEXT: kmovw %eax, %k1 5742; X86-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5743; X86-NEXT: vmovapd %xmm2, %xmm0 5744; X86-NEXT: retl 5745; 5746; X64-LABEL: test_mm_mask3_fnmsub_pd: 5747; X64: # %bb.0: # %entry 5748; X64-NEXT: kmovw %edi, %k1 5749; X64-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5750; X64-NEXT: vmovapd %xmm2, %xmm0 5751; X64-NEXT: retq 5752entry: 5753 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B 5754 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5755 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 5756 %1 = bitcast i8 %__U to <8 x i1> 5757 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5758 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 5759 ret <2 x double> %2 5760} 5761 5762define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5763; X86-LABEL: test_mm256_mask_fnmsub_pd: 5764; X86: # %bb.0: # %entry 5765; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5766; X86-NEXT: kmovw %eax, %k1 5767; X86-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 5768; X86-NEXT: retl 5769; 5770; X64-LABEL: test_mm256_mask_fnmsub_pd: 5771; X64: # %bb.0: # %entry 5772; X64-NEXT: kmovw %edi, %k1 5773; X64-NEXT: vfnmsub132pd {{.*#+}} ymm0 = 
-(ymm0 * ymm1) - ymm2 5774; X64-NEXT: retq 5775entry: 5776 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5777 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5778 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 5779 %1 = bitcast i8 %__U to <8 x i1> 5780 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5781 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 5782 ret <4 x double> %2 5783} 5784 5785define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5786; X86-LABEL: test_mm256_mask3_fnmsub_pd: 5787; X86: # %bb.0: # %entry 5788; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5789; X86-NEXT: kmovw %eax, %k1 5790; X86-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 5791; X86-NEXT: vmovapd %ymm2, %ymm0 5792; X86-NEXT: retl 5793; 5794; X64-LABEL: test_mm256_mask3_fnmsub_pd: 5795; X64: # %bb.0: # %entry 5796; X64-NEXT: kmovw %edi, %k1 5797; X64-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 5798; X64-NEXT: vmovapd %ymm2, %ymm0 5799; X64-NEXT: retq 5800entry: 5801 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5802 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5803 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 5804 %1 = bitcast i8 %__U to <8 x i1> 5805 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5806 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 5807 ret <4 x double> %2 5808} 5809 5810define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> 
%__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5811; X86-LABEL: test_mm_mask_fnmsub_ps: 5812; X86: # %bb.0: # %entry 5813; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5814; X86-NEXT: kmovw %eax, %k1 5815; X86-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 5816; X86-NEXT: retl 5817; 5818; X64-LABEL: test_mm_mask_fnmsub_ps: 5819; X64: # %bb.0: # %entry 5820; X64-NEXT: kmovw %edi, %k1 5821; X64-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 5822; X64-NEXT: retq 5823entry: 5824 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5825 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5826 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 5827 %1 = bitcast i8 %__U to <8 x i1> 5828 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5829 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 5830 ret <4 x float> %2 5831} 5832 5833define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5834; X86-LABEL: test_mm_mask3_fnmsub_ps: 5835; X86: # %bb.0: # %entry 5836; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5837; X86-NEXT: kmovw %eax, %k1 5838; X86-NEXT: vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5839; X86-NEXT: vmovaps %xmm2, %xmm0 5840; X86-NEXT: retl 5841; 5842; X64-LABEL: test_mm_mask3_fnmsub_ps: 5843; X64: # %bb.0: # %entry 5844; X64-NEXT: kmovw %edi, %k1 5845; X64-NEXT: vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5846; X64-NEXT: vmovaps %xmm2, %xmm0 5847; X64-NEXT: retq 5848entry: 5849 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5850 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5851 %0 = tail 
call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 5852 %1 = bitcast i8 %__U to <8 x i1> 5853 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5854 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 5855 ret <4 x float> %2 5856} 5857 5858define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5859; X86-LABEL: test_mm256_mask_fnmsub_ps: 5860; X86: # %bb.0: # %entry 5861; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5862; X86-NEXT: kmovw %eax, %k1 5863; X86-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 5864; X86-NEXT: retl 5865; 5866; X64-LABEL: test_mm256_mask_fnmsub_ps: 5867; X64: # %bb.0: # %entry 5868; X64-NEXT: kmovw %edi, %k1 5869; X64-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2 5870; X64-NEXT: retq 5871entry: 5872 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5873 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5874 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 5875 %1 = bitcast i8 %__U to <8 x i1> 5876 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 5877 ret <8 x float> %2 5878} 5879 5880define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5881; X86-LABEL: test_mm256_mask3_fnmsub_ps: 5882; X86: # %bb.0: # %entry 5883; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5884; X86-NEXT: kmovw %eax, %k1 5885; X86-NEXT: vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 5886; X86-NEXT: vmovaps %ymm2, %ymm0 5887; X86-NEXT: retl 5888; 5889; X64-LABEL: 
test_mm256_mask3_fnmsub_ps: 5890; X64: # %bb.0: # %entry 5891; X64-NEXT: kmovw %edi, %k1 5892; X64-NEXT: vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2 5893; X64-NEXT: vmovaps %ymm2, %ymm0 5894; X64-NEXT: retq 5895entry: 5896 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5897 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5898 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 5899 %1 = bitcast i8 %__U to <8 x i1> 5900 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 5901 ret <8 x float> %2 5902} 5903 5904define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 5905; X86-LABEL: test_mm_mask_expandloadu_pd: 5906; X86: # %bb.0: # %entry 5907; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5908; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5909; X86-NEXT: kmovw %ecx, %k1 5910; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} 5911; X86-NEXT: retl 5912; 5913; X64-LABEL: test_mm_mask_expandloadu_pd: 5914; X64: # %bb.0: # %entry 5915; X64-NEXT: kmovw %edi, %k1 5916; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} 5917; X64-NEXT: retq 5918entry: 5919 %0 = bitcast i8* %__P to double* 5920 %1 = bitcast i8 %__U to <8 x i1> 5921 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5922 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W) 5923 ret <2 x double> %2 5924} 5925 5926define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 5927; X86-LABEL: test_mm_maskz_expandloadu_pd: 5928; X86: # %bb.0: # %entry 5929; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5930; X86-NEXT: movb 
{{[0-9]+}}(%esp), %cl 5931; X86-NEXT: kmovw %ecx, %k1 5932; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z} 5933; X86-NEXT: retl 5934; 5935; X64-LABEL: test_mm_maskz_expandloadu_pd: 5936; X64: # %bb.0: # %entry 5937; X64-NEXT: kmovw %edi, %k1 5938; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z} 5939; X64-NEXT: retq 5940entry: 5941 %0 = bitcast i8* %__P to double* 5942 %1 = bitcast i8 %__U to <8 x i1> 5943 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5944 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer) 5945 ret <2 x double> %2 5946} 5947 5948define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 5949; X86-LABEL: test_mm256_mask_expandloadu_pd: 5950; X86: # %bb.0: # %entry 5951; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5952; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5953; X86-NEXT: kmovw %ecx, %k1 5954; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} 5955; X86-NEXT: retl 5956; 5957; X64-LABEL: test_mm256_mask_expandloadu_pd: 5958; X64: # %bb.0: # %entry 5959; X64-NEXT: kmovw %edi, %k1 5960; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} 5961; X64-NEXT: retq 5962entry: 5963 %0 = bitcast i8* %__P to double* 5964 %1 = bitcast i8 %__U to <8 x i1> 5965 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5966 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W) 5967 ret <4 x double> %2 5968} 5969 5970define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 5971; X86-LABEL: test_mm256_maskz_expandloadu_pd: 5972; X86: # %bb.0: # %entry 5973; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5974; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5975; X86-NEXT: kmovw %ecx, %k1 5976; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z} 5977; X86-NEXT: retl 5978; 5979; X64-LABEL: test_mm256_maskz_expandloadu_pd: 5980; X64: # %bb.0: # %entry 5981; 
X64-NEXT: kmovw %edi, %k1 5982; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z} 5983; X64-NEXT: retq 5984entry: 5985 %0 = bitcast i8* %__P to double* 5986 %1 = bitcast i8 %__U to <8 x i1> 5987 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5988 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer) 5989 ret <4 x double> %2 5990} 5991 5992define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 5993; X86-LABEL: test_mm_mask_expandloadu_epi64: 5994; X86: # %bb.0: # %entry 5995; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 5996; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 5997; X86-NEXT: kmovw %ecx, %k1 5998; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} 5999; X86-NEXT: retl 6000; 6001; X64-LABEL: test_mm_mask_expandloadu_epi64: 6002; X64: # %bb.0: # %entry 6003; X64-NEXT: kmovw %edi, %k1 6004; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} 6005; X64-NEXT: retq 6006entry: 6007 %0 = bitcast i8* %__P to i64* 6008 %1 = bitcast i8 %__U to <8 x i1> 6009 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6010 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10 6011 ret <2 x i64> %2 6012} 6013 6014define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6015; X86-LABEL: test_mm_maskz_expandloadu_epi64: 6016; X86: # %bb.0: # %entry 6017; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6018; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6019; X86-NEXT: kmovw %ecx, %k1 6020; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z} 6021; X86-NEXT: retl 6022; 6023; X64-LABEL: test_mm_maskz_expandloadu_epi64: 6024; X64: # %bb.0: # %entry 6025; X64-NEXT: kmovw %edi, %k1 6026; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z} 6027; X64-NEXT: retq 6028entry: 6029 %0 = bitcast i8* %__P to i64* 6030 %1 = bitcast i8 %__U to <8 x i1> 6031 %extract.i = shufflevector <8 x i1> %1, <8 x i1> 
undef, <2 x i32> <i32 0, i32 1> 6032 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer) 6033 ret <2 x i64> %2 6034} 6035 6036define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6037; X86-LABEL: test_mm256_mask_expandloadu_epi64: 6038; X86: # %bb.0: # %entry 6039; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6040; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6041; X86-NEXT: kmovw %ecx, %k1 6042; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} 6043; X86-NEXT: retl 6044; 6045; X64-LABEL: test_mm256_mask_expandloadu_epi64: 6046; X64: # %bb.0: # %entry 6047; X64-NEXT: kmovw %edi, %k1 6048; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} 6049; X64-NEXT: retq 6050entry: 6051 %0 = bitcast i8* %__P to i64* 6052 %1 = bitcast i8 %__U to <8 x i1> 6053 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6054 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10 6055 ret <4 x i64> %2 6056} 6057 6058define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6059; X86-LABEL: test_mm256_maskz_expandloadu_epi64: 6060; X86: # %bb.0: # %entry 6061; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6062; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6063; X86-NEXT: kmovw %ecx, %k1 6064; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z} 6065; X86-NEXT: retl 6066; 6067; X64-LABEL: test_mm256_maskz_expandloadu_epi64: 6068; X64: # %bb.0: # %entry 6069; X64-NEXT: kmovw %edi, %k1 6070; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z} 6071; X64-NEXT: retq 6072entry: 6073 %0 = bitcast i8* %__P to i64* 6074 %1 = bitcast i8 %__U to <8 x i1> 6075 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6076 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer) 6077 ret <4 x i64> %2 6078} 6079 6080define <4 x float> 
@test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) { 6081; X86-LABEL: test_mm_mask_expandloadu_ps: 6082; X86: # %bb.0: # %entry 6083; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6084; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6085; X86-NEXT: kmovw %ecx, %k1 6086; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} 6087; X86-NEXT: retl 6088; 6089; X64-LABEL: test_mm_mask_expandloadu_ps: 6090; X64: # %bb.0: # %entry 6091; X64-NEXT: kmovw %edi, %k1 6092; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} 6093; X64-NEXT: retq 6094entry: 6095 %0 = bitcast i8* %__P to float* 6096 %1 = bitcast i8 %__U to <8 x i1> 6097 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6098 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W) 6099 ret <4 x float> %2 6100} 6101 6102define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) { 6103; X86-LABEL: test_mm_maskz_expandloadu_ps: 6104; X86: # %bb.0: # %entry 6105; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6106; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6107; X86-NEXT: kmovw %ecx, %k1 6108; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z} 6109; X86-NEXT: retl 6110; 6111; X64-LABEL: test_mm_maskz_expandloadu_ps: 6112; X64: # %bb.0: # %entry 6113; X64-NEXT: kmovw %edi, %k1 6114; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z} 6115; X64-NEXT: retq 6116entry: 6117 %0 = bitcast i8* %__P to float* 6118 %1 = bitcast i8 %__U to <8 x i1> 6119 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6120 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer) 6121 ret <4 x float> %2 6122} 6123 6124define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) { 6125; X86-LABEL: test_mm256_mask_expandloadu_ps: 6126; X86: # %bb.0: # %entry 6127; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6128; X86-NEXT: movb 
{{[0-9]+}}(%esp), %cl 6129; X86-NEXT: kmovw %ecx, %k1 6130; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} 6131; X86-NEXT: retl 6132; 6133; X64-LABEL: test_mm256_mask_expandloadu_ps: 6134; X64: # %bb.0: # %entry 6135; X64-NEXT: kmovw %edi, %k1 6136; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} 6137; X64-NEXT: retq 6138entry: 6139 %0 = bitcast i8* %__P to float* 6140 %1 = bitcast i8 %__U to <8 x i1> 6141 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W) 6142 ret <8 x float> %2 6143} 6144 6145define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) { 6146; X86-LABEL: test_mm256_maskz_expandloadu_ps: 6147; X86: # %bb.0: # %entry 6148; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6149; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6150; X86-NEXT: kmovw %ecx, %k1 6151; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z} 6152; X86-NEXT: retl 6153; 6154; X64-LABEL: test_mm256_maskz_expandloadu_ps: 6155; X64: # %bb.0: # %entry 6156; X64-NEXT: kmovw %edi, %k1 6157; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z} 6158; X64-NEXT: retq 6159entry: 6160 %0 = bitcast i8* %__P to float* 6161 %1 = bitcast i8 %__U to <8 x i1> 6162 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer) 6163 ret <8 x float> %2 6164} 6165 6166define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6167; X86-LABEL: test_mm_mask_expandloadu_epi32: 6168; X86: # %bb.0: # %entry 6169; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6170; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6171; X86-NEXT: kmovw %ecx, %k1 6172; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} 6173; X86-NEXT: retl 6174; 6175; X64-LABEL: test_mm_mask_expandloadu_epi32: 6176; X64: # %bb.0: # %entry 6177; X64-NEXT: kmovw %edi, %k1 6178; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} 6179; X64-NEXT: retq 6180entry: 6181 %0 = bitcast <2 x i64> %__W to <4 x i32> 6182 %1 = bitcast i8* %__P to i32* 6183 %2 = bitcast i8 %__U to <8 x i1> 
6184 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6185 %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0) 6186 %4 = bitcast <4 x i32> %3 to <2 x i64> 6187 ret <2 x i64> %4 6188} 6189 6190define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) { 6191; X86-LABEL: test_mm_maskz_expandloadu_epi32: 6192; X86: # %bb.0: # %entry 6193; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6194; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6195; X86-NEXT: kmovw %ecx, %k1 6196; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z} 6197; X86-NEXT: retl 6198; 6199; X64-LABEL: test_mm_maskz_expandloadu_epi32: 6200; X64: # %bb.0: # %entry 6201; X64-NEXT: kmovw %edi, %k1 6202; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z} 6203; X64-NEXT: retq 6204entry: 6205 %0 = bitcast i8* %__P to i32* 6206 %1 = bitcast i8 %__U to <8 x i1> 6207 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6208 %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer) 6209 %3 = bitcast <4 x i32> %2 to <2 x i64> 6210 ret <2 x i64> %3 6211} 6212 6213define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6214; X86-LABEL: test_mm256_mask_expandloadu_epi32: 6215; X86: # %bb.0: # %entry 6216; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6217; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6218; X86-NEXT: kmovw %ecx, %k1 6219; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} 6220; X86-NEXT: retl 6221; 6222; X64-LABEL: test_mm256_mask_expandloadu_epi32: 6223; X64: # %bb.0: # %entry 6224; X64-NEXT: kmovw %edi, %k1 6225; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} 6226; X64-NEXT: retq 6227entry: 6228 %0 = bitcast <4 x i64> %__W to <8 x i32> 6229 %1 = bitcast i8* %__P to i32* 6230 %2 = bitcast i8 %__U to <8 x i1> 6231 %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0) 
6232 %4 = bitcast <8 x i32> %3 to <4 x i64> 6233 ret <4 x i64> %4 6234} 6235 6236define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) { 6237; X86-LABEL: test_mm256_maskz_expandloadu_epi32: 6238; X86: # %bb.0: # %entry 6239; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6240; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6241; X86-NEXT: kmovw %ecx, %k1 6242; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z} 6243; X86-NEXT: retl 6244; 6245; X64-LABEL: test_mm256_maskz_expandloadu_epi32: 6246; X64: # %bb.0: # %entry 6247; X64-NEXT: kmovw %edi, %k1 6248; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z} 6249; X64-NEXT: retq 6250entry: 6251 %0 = bitcast i8* %__P to i32* 6252 %1 = bitcast i8 %__U to <8 x i1> 6253 %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer) 6254 %3 = bitcast <8 x i32> %2 to <4 x i64> 6255 ret <4 x i64> %3 6256} 6257 6258define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) { 6259; X86-LABEL: test_mm_mask_compressstoreu_pd: 6260; X86: # %bb.0: # %entry 6261; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6262; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6263; X86-NEXT: kmovw %eax, %k1 6264; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1} 6265; X86-NEXT: retl 6266; 6267; X64-LABEL: test_mm_mask_compressstoreu_pd: 6268; X64: # %bb.0: # %entry 6269; X64-NEXT: kmovw %esi, %k1 6270; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1} 6271; X64-NEXT: retq 6272entry: 6273 %0 = bitcast i8* %__P to double* 6274 %1 = bitcast i8 %__U to <8 x i1> 6275 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6276 tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i) 6277 ret void 6278} 6279 6280define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) { 6281; X86-LABEL: test_mm256_mask_compressstoreu_pd: 6282; X86: # %bb.0: # %entry 6283; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6284; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx 6285; X86-NEXT: kmovw %eax, %k1 6286; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1} 6287; X86-NEXT: vzeroupper 6288; X86-NEXT: retl 6289; 6290; X64-LABEL: test_mm256_mask_compressstoreu_pd: 6291; X64: # %bb.0: # %entry 6292; X64-NEXT: kmovw %esi, %k1 6293; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1} 6294; X64-NEXT: vzeroupper 6295; X64-NEXT: retq 6296entry: 6297 %0 = bitcast i8* %__P to double* 6298 %1 = bitcast i8 %__U to <8 x i1> 6299 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6300 tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i) 6301 ret void 6302} 6303 6304define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) { 6305; X86-LABEL: test_mm_mask_compressstoreu_epi64: 6306; X86: # %bb.0: # %entry 6307; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6308; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6309; X86-NEXT: kmovw %eax, %k1 6310; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1} 6311; X86-NEXT: retl 6312; 6313; X64-LABEL: test_mm_mask_compressstoreu_epi64: 6314; X64: # %bb.0: # %entry 6315; X64-NEXT: kmovw %esi, %k1 6316; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1} 6317; X64-NEXT: retq 6318entry: 6319 %0 = bitcast i8* %__P to i64* 6320 %1 = bitcast i8 %__U to <8 x i1> 6321 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6322 tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i) 6323 ret void 6324} 6325 6326define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) { 6327; X86-LABEL: test_mm256_mask_compressstoreu_epi64: 6328; X86: # %bb.0: # %entry 6329; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6330; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6331; X86-NEXT: kmovw %eax, %k1 6332; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1} 6333; X86-NEXT: vzeroupper 6334; X86-NEXT: retl 6335; 6336; X64-LABEL: test_mm256_mask_compressstoreu_epi64: 6337; X64: 
# %bb.0: # %entry 6338; X64-NEXT: kmovw %esi, %k1 6339; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1} 6340; X64-NEXT: vzeroupper 6341; X64-NEXT: retq 6342entry: 6343 %0 = bitcast i8* %__P to i64* 6344 %1 = bitcast i8 %__U to <8 x i1> 6345 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6346 tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i) 6347 ret void 6348} 6349 6350define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) { 6351; X86-LABEL: test_mm_mask_compressstoreu_ps: 6352; X86: # %bb.0: # %entry 6353; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6354; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6355; X86-NEXT: kmovw %eax, %k1 6356; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1} 6357; X86-NEXT: retl 6358; 6359; X64-LABEL: test_mm_mask_compressstoreu_ps: 6360; X64: # %bb.0: # %entry 6361; X64-NEXT: kmovw %esi, %k1 6362; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1} 6363; X64-NEXT: retq 6364entry: 6365 %0 = bitcast i8* %__P to float* 6366 %1 = bitcast i8 %__U to <8 x i1> 6367 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6368 tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i) 6369 ret void 6370} 6371 6372define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) { 6373; X86-LABEL: test_mm256_mask_compressstoreu_ps: 6374; X86: # %bb.0: # %entry 6375; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6376; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6377; X86-NEXT: kmovw %eax, %k1 6378; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1} 6379; X86-NEXT: vzeroupper 6380; X86-NEXT: retl 6381; 6382; X64-LABEL: test_mm256_mask_compressstoreu_ps: 6383; X64: # %bb.0: # %entry 6384; X64-NEXT: kmovw %esi, %k1 6385; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1} 6386; X64-NEXT: vzeroupper 6387; X64-NEXT: retq 6388entry: 6389 %0 = bitcast i8* %__P to float* 6390 %1 = bitcast i8 %__U to 
<8 x i1> 6391 tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1) 6392 ret void 6393} 6394 6395define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) { 6396; X86-LABEL: test_mm_mask_compressstoreu_epi32: 6397; X86: # %bb.0: # %entry 6398; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6399; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6400; X86-NEXT: kmovw %eax, %k1 6401; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1} 6402; X86-NEXT: retl 6403; 6404; X64-LABEL: test_mm_mask_compressstoreu_epi32: 6405; X64: # %bb.0: # %entry 6406; X64-NEXT: kmovw %esi, %k1 6407; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1} 6408; X64-NEXT: retq 6409entry: 6410 %0 = bitcast <2 x i64> %__A to <4 x i32> 6411 %1 = bitcast i8* %__P to i32* 6412 %2 = bitcast i8 %__U to <8 x i1> 6413 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6414 tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i) 6415 ret void 6416} 6417 6418define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) { 6419; X86-LABEL: test_mm256_mask_compressstoreu_epi32: 6420; X86: # %bb.0: # %entry 6421; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6422; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6423; X86-NEXT: kmovw %eax, %k1 6424; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1} 6425; X86-NEXT: vzeroupper 6426; X86-NEXT: retl 6427; 6428; X64-LABEL: test_mm256_mask_compressstoreu_epi32: 6429; X64: # %bb.0: # %entry 6430; X64-NEXT: kmovw %esi, %k1 6431; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1} 6432; X64-NEXT: vzeroupper 6433; X64-NEXT: retq 6434entry: 6435 %0 = bitcast <4 x i64> %__A to <8 x i32> 6436 %1 = bitcast i8* %__P to i32* 6437 %2 = bitcast i8 %__U to <8 x i1> 6438 tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10 6439 ret void 6440} 6441 6442 6443declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8 
6444declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8 6445declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8 6446declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8 6447 6448define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) { 6449; X86-LABEL: test_mm_mask_sqrt_pd: 6450; X86: # %bb.0: # %entry 6451; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6452; X86-NEXT: kmovw %eax, %k1 6453; X86-NEXT: vsqrtpd %xmm1, %xmm0 {%k1} 6454; X86-NEXT: retl 6455; 6456; X64-LABEL: test_mm_mask_sqrt_pd: 6457; X64: # %bb.0: # %entry 6458; X64-NEXT: kmovw %edi, %k1 6459; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1} 6460; X64-NEXT: retq 6461entry: 6462 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 6463 %1 = bitcast i8 %__U to <8 x i1> 6464 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6465 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W 6466 ret <2 x double> %2 6467} 6468 6469declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) 6470 6471define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) { 6472; X86-LABEL: test_mm_maskz_sqrt_pd: 6473; X86: # %bb.0: # %entry 6474; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6475; X86-NEXT: kmovw %eax, %k1 6476; X86-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z} 6477; X86-NEXT: retl 6478; 6479; X64-LABEL: test_mm_maskz_sqrt_pd: 6480; X64: # %bb.0: # %entry 6481; X64-NEXT: kmovw %edi, %k1 6482; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z} 6483; X64-NEXT: retq 6484entry: 6485 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 6486 %1 = bitcast i8 %__U to <8 x i1> 6487 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6488 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 6489 ret <2 x double> %2 6490} 6491 6492define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 
x double> %__A) { 6493; X86-LABEL: test_mm256_mask_sqrt_pd: 6494; X86: # %bb.0: # %entry 6495; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6496; X86-NEXT: kmovw %eax, %k1 6497; X86-NEXT: vsqrtpd %ymm1, %ymm0 {%k1} 6498; X86-NEXT: retl 6499; 6500; X64-LABEL: test_mm256_mask_sqrt_pd: 6501; X64: # %bb.0: # %entry 6502; X64-NEXT: kmovw %edi, %k1 6503; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1} 6504; X64-NEXT: retq 6505entry: 6506 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 6507 %1 = bitcast i8 %__U to <8 x i1> 6508 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6509 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W 6510 ret <4 x double> %2 6511} 6512 6513declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) 6514 6515define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) { 6516; X86-LABEL: test_mm256_maskz_sqrt_pd: 6517; X86: # %bb.0: # %entry 6518; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6519; X86-NEXT: kmovw %eax, %k1 6520; X86-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} 6521; X86-NEXT: retl 6522; 6523; X64-LABEL: test_mm256_maskz_sqrt_pd: 6524; X64: # %bb.0: # %entry 6525; X64-NEXT: kmovw %edi, %k1 6526; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} 6527; X64-NEXT: retq 6528entry: 6529 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 6530 %1 = bitcast i8 %__U to <8 x i1> 6531 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6532 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 6533 ret <4 x double> %2 6534} 6535 6536define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) { 6537; X86-LABEL: test_mm_mask_sqrt_ps: 6538; X86: # %bb.0: # %entry 6539; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6540; X86-NEXT: kmovw %eax, %k1 6541; X86-NEXT: vsqrtps %xmm1, %xmm0 {%k1} 6542; X86-NEXT: retl 6543; 6544; X64-LABEL: test_mm_mask_sqrt_ps: 6545; X64: # %bb.0: # %entry 
6546; X64-NEXT: kmovw %edi, %k1 6547; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1} 6548; X64-NEXT: retq 6549entry: 6550 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 6551 %1 = bitcast i8 %__U to <8 x i1> 6552 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6553 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W 6554 ret <4 x float> %2 6555} 6556 6557declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) 6558 6559define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) { 6560; X86-LABEL: test_mm_maskz_sqrt_ps: 6561; X86: # %bb.0: # %entry 6562; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6563; X86-NEXT: kmovw %eax, %k1 6564; X86-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z} 6565; X86-NEXT: retl 6566; 6567; X64-LABEL: test_mm_maskz_sqrt_ps: 6568; X64: # %bb.0: # %entry 6569; X64-NEXT: kmovw %edi, %k1 6570; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z} 6571; X64-NEXT: retq 6572entry: 6573 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 6574 %1 = bitcast i8 %__U to <8 x i1> 6575 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6576 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 6577 ret <4 x float> %2 6578} 6579 6580define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) { 6581; X86-LABEL: test_mm256_mask_sqrt_ps: 6582; X86: # %bb.0: # %entry 6583; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6584; X86-NEXT: kmovw %eax, %k1 6585; X86-NEXT: vsqrtps %ymm1, %ymm0 {%k1} 6586; X86-NEXT: retl 6587; 6588; X64-LABEL: test_mm256_mask_sqrt_ps: 6589; X64: # %bb.0: # %entry 6590; X64-NEXT: kmovw %edi, %k1 6591; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1} 6592; X64-NEXT: retq 6593entry: 6594 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 6595 %1 = bitcast i8 %__U to <8 x i1> 6596 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W 6597 ret <8 x float> %2 6598} 6599 6600define 
<8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) { 6601; X86-LABEL: test_mm256_maskz_sqrt_ps: 6602; X86: # %bb.0: # %entry 6603; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6604; X86-NEXT: kmovw %eax, %k1 6605; X86-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} 6606; X86-NEXT: retl 6607; 6608; X64-LABEL: test_mm256_maskz_sqrt_ps: 6609; X64: # %bb.0: # %entry 6610; X64-NEXT: kmovw %edi, %k1 6611; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} 6612; X64-NEXT: retq 6613entry: 6614 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 6615 %1 = bitcast i8 %__U to <8 x i1> 6616 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 6617 ret <8 x float> %2 6618} 6619 6620declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) 6621 6622define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) { 6623; CHECK-LABEL: test_mm_rol_epi32: 6624; CHECK: # %bb.0: # %entry 6625; CHECK-NEXT: vprold $5, %xmm0, %xmm0 6626; CHECK-NEXT: ret{{[l|q]}} 6627entry: 6628 %0 = bitcast <2 x i64> %__A to <4 x i32> 6629 %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5) 6630 %2 = bitcast <4 x i32> %1 to <2 x i64> 6631 ret <2 x i64> %2 6632} 6633 6634declare <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32>, i32) 6635 6636define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 6637; X86-LABEL: test_mm_mask_rol_epi32: 6638; X86: # %bb.0: # %entry 6639; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6640; X86-NEXT: kmovw %eax, %k1 6641; X86-NEXT: vprold $5, %xmm1, %xmm0 {%k1} 6642; X86-NEXT: retl 6643; 6644; X64-LABEL: test_mm_mask_rol_epi32: 6645; X64: # %bb.0: # %entry 6646; X64-NEXT: kmovw %edi, %k1 6647; X64-NEXT: vprold $5, %xmm1, %xmm0 {%k1} 6648; X64-NEXT: retq 6649entry: 6650 %0 = bitcast <2 x i64> %__A to <4 x i32> 6651 %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5) 6652 %2 = bitcast <2 x i64> %__W to <4 x i32> 6653 %3 = bitcast i8 %__U to <8 x i1> 6654 %extract = shufflevector <8 x i1> %3, <8 x i1> 
undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6655 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2 6656 %5 = bitcast <4 x i32> %4 to <2 x i64> 6657 ret <2 x i64> %5 6658} 6659 6660define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) { 6661; X86-LABEL: test_mm_maskz_rol_epi32: 6662; X86: # %bb.0: # %entry 6663; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6664; X86-NEXT: kmovw %eax, %k1 6665; X86-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z} 6666; X86-NEXT: retl 6667; 6668; X64-LABEL: test_mm_maskz_rol_epi32: 6669; X64: # %bb.0: # %entry 6670; X64-NEXT: kmovw %edi, %k1 6671; X64-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z} 6672; X64-NEXT: retq 6673entry: 6674 %0 = bitcast <2 x i64> %__A to <4 x i32> 6675 %1 = tail call <4 x i32> @llvm.x86.avx512.prol.d.128(<4 x i32> %0, i32 5) 6676 %2 = bitcast i8 %__U to <8 x i1> 6677 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6678 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer 6679 %4 = bitcast <4 x i32> %3 to <2 x i64> 6680 ret <2 x i64> %4 6681} 6682 6683define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) { 6684; CHECK-LABEL: test_mm256_rol_epi32: 6685; CHECK: # %bb.0: # %entry 6686; CHECK-NEXT: vprold $5, %ymm0, %ymm0 6687; CHECK-NEXT: ret{{[l|q]}} 6688entry: 6689 %0 = bitcast <4 x i64> %__A to <8 x i32> 6690 %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5) 6691 %2 = bitcast <8 x i32> %1 to <4 x i64> 6692 ret <4 x i64> %2 6693} 6694 6695declare <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32>, i32) 6696 6697define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 6698; X86-LABEL: test_mm256_mask_rol_epi32: 6699; X86: # %bb.0: # %entry 6700; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6701; X86-NEXT: kmovw %eax, %k1 6702; X86-NEXT: vprold $5, %ymm1, %ymm0 {%k1} 6703; X86-NEXT: retl 6704; 6705; X64-LABEL: test_mm256_mask_rol_epi32: 6706; X64: # %bb.0: # %entry 6707; X64-NEXT: kmovw %edi, 
%k1 6708; X64-NEXT: vprold $5, %ymm1, %ymm0 {%k1} 6709; X64-NEXT: retq 6710entry: 6711 %0 = bitcast <4 x i64> %__A to <8 x i32> 6712 %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5) 6713 %2 = bitcast <4 x i64> %__W to <8 x i32> 6714 %3 = bitcast i8 %__U to <8 x i1> 6715 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2 6716 %5 = bitcast <8 x i32> %4 to <4 x i64> 6717 ret <4 x i64> %5 6718} 6719 6720define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) { 6721; X86-LABEL: test_mm256_maskz_rol_epi32: 6722; X86: # %bb.0: # %entry 6723; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6724; X86-NEXT: kmovw %eax, %k1 6725; X86-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z} 6726; X86-NEXT: retl 6727; 6728; X64-LABEL: test_mm256_maskz_rol_epi32: 6729; X64: # %bb.0: # %entry 6730; X64-NEXT: kmovw %edi, %k1 6731; X64-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z} 6732; X64-NEXT: retq 6733entry: 6734 %0 = bitcast <4 x i64> %__A to <8 x i32> 6735 %1 = tail call <8 x i32> @llvm.x86.avx512.prol.d.256(<8 x i32> %0, i32 5) 6736 %2 = bitcast i8 %__U to <8 x i1> 6737 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer 6738 %4 = bitcast <8 x i32> %3 to <4 x i64> 6739 ret <4 x i64> %4 6740} 6741 6742define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) { 6743; CHECK-LABEL: test_mm_rol_epi64: 6744; CHECK: # %bb.0: # %entry 6745; CHECK-NEXT: vprolq $5, %xmm0, %xmm0 6746; CHECK-NEXT: ret{{[l|q]}} 6747entry: 6748 %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5) 6749 ret <2 x i64> %0 6750} 6751 6752declare <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64>, i32) 6753 6754define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 6755; X86-LABEL: test_mm_mask_rol_epi64: 6756; X86: # %bb.0: # %entry 6757; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6758; X86-NEXT: kmovw %eax, %k1 6759; X86-NEXT: vprolq $5, %xmm1, %xmm0 {%k1} 6760; X86-NEXT: retl 6761; 6762; X64-LABEL: test_mm_mask_rol_epi64: 6763; X64: # 
%bb.0: # %entry 6764; X64-NEXT: kmovw %edi, %k1 6765; X64-NEXT: vprolq $5, %xmm1, %xmm0 {%k1} 6766; X64-NEXT: retq 6767entry: 6768 %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5) 6769 %1 = bitcast i8 %__U to <8 x i1> 6770 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6771 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W 6772 ret <2 x i64> %2 6773} 6774 6775define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) { 6776; X86-LABEL: test_mm_maskz_rol_epi64: 6777; X86: # %bb.0: # %entry 6778; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6779; X86-NEXT: kmovw %eax, %k1 6780; X86-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z} 6781; X86-NEXT: retl 6782; 6783; X64-LABEL: test_mm_maskz_rol_epi64: 6784; X64: # %bb.0: # %entry 6785; X64-NEXT: kmovw %edi, %k1 6786; X64-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z} 6787; X64-NEXT: retq 6788entry: 6789 %0 = tail call <2 x i64> @llvm.x86.avx512.prol.q.128(<2 x i64> %__A, i32 5) 6790 %1 = bitcast i8 %__U to <8 x i1> 6791 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6792 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer 6793 ret <2 x i64> %2 6794} 6795 6796define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) { 6797; CHECK-LABEL: test_mm256_rol_epi64: 6798; CHECK: # %bb.0: # %entry 6799; CHECK-NEXT: vprolq $5, %ymm0, %ymm0 6800; CHECK-NEXT: ret{{[l|q]}} 6801entry: 6802 %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5) 6803 ret <4 x i64> %0 6804} 6805 6806declare <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64>, i32) 6807 6808define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 6809; X86-LABEL: test_mm256_mask_rol_epi64: 6810; X86: # %bb.0: # %entry 6811; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6812; X86-NEXT: kmovw %eax, %k1 6813; X86-NEXT: vprolq $5, %ymm1, %ymm0 {%k1} 6814; X86-NEXT: retl 6815; 6816; X64-LABEL: test_mm256_mask_rol_epi64: 6817; 
X64: # %bb.0: # %entry 6818; X64-NEXT: kmovw %edi, %k1 6819; X64-NEXT: vprolq $5, %ymm1, %ymm0 {%k1} 6820; X64-NEXT: retq 6821entry: 6822 %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5) 6823 %1 = bitcast i8 %__U to <8 x i1> 6824 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6825 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W 6826 ret <4 x i64> %2 6827} 6828 6829define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) { 6830; X86-LABEL: test_mm256_maskz_rol_epi64: 6831; X86: # %bb.0: # %entry 6832; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6833; X86-NEXT: kmovw %eax, %k1 6834; X86-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z} 6835; X86-NEXT: retl 6836; 6837; X64-LABEL: test_mm256_maskz_rol_epi64: 6838; X64: # %bb.0: # %entry 6839; X64-NEXT: kmovw %edi, %k1 6840; X64-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z} 6841; X64-NEXT: retq 6842entry: 6843 %0 = tail call <4 x i64> @llvm.x86.avx512.prol.q.256(<4 x i64> %__A, i32 5) 6844 %1 = bitcast i8 %__U to <8 x i1> 6845 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6846 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer 6847 ret <4 x i64> %2 6848} 6849 6850define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) { 6851; CHECK-LABEL: test_mm_rolv_epi32: 6852; CHECK: # %bb.0: # %entry 6853; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 6854; CHECK-NEXT: ret{{[l|q]}} 6855entry: 6856 %0 = bitcast <2 x i64> %__A to <4 x i32> 6857 %1 = bitcast <2 x i64> %__B to <4 x i32> 6858 %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1) 6859 %3 = bitcast <4 x i32> %2 to <2 x i64> 6860 ret <2 x i64> %3 6861} 6862 6863define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 6864; X86-LABEL: test_mm_mask_rolv_epi32: 6865; X86: # %bb.0: # %entry 6866; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6867; 
X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <2 x i64> %__W to <4 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

; Zero-masking form of the 128-bit rolv_epi32 test. The prolv.d.128 result is
; blended with zeroinitializer under lanes 0-3 of the i8 mask (bitcast to
; <8 x i1>, then narrowed by the shufflevector). Both runs expect a single
; zero-masked vprolvd; the i386 run first loads the mask from the stack.
define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}

; 256-bit rolv_epi32 tests: unmasked, merge-masked (select against %__W) and
; zero-masked (select against zeroinitializer). With eight i32 lanes the i8
; mask is bitcast to <8 x i1> and fed to the select directly, so these
; functions contain no shufflevector. Each expects one vprolvd on ymm registers.
define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rolv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <4 x i64> %__W to <8 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}

; 128-bit rolv_epi64 tests. The arguments are already <2 x i64>, so
; prolv.q.128 is called without bitcasts. The masked forms keep only lanes 0-1
; of the <8 x i1> mask. Each expects one vprolvq on xmm registers.
define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

; 256-bit rolv_epi64 tests. Same shape as the 128-bit epi64 tests, using
; prolv.q.256; the masked forms keep lanes 0-3 of the <8 x i1> mask. Each
; expects one vprolvq on ymm registers.
define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

; 128-bit ror_epi32 tests. These call pror.d.128 with a constant i32 5 as the
; second operand and expect it to appear as the immediate in `vprord $5`.
; The masked forms select under lanes 0-3 of the <8 x i1> mask.
define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

declare <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32>, i32)

define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast <2 x i64> %__W to <4 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}

define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.pror.d.128(<4 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

; 256-bit ror_epi32 tests. These call pror.d.256 with the same constant i32 5;
; the masked forms use the full <8 x i1> mask with no shufflevector. Each
; expects one `vprord $5` on ymm registers.
define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

declare <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32>, i32)

define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast <4 x i64> %__W to <8 x i32>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}

define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.pror.d.256(<8 x i32> %0, i32 5)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

; 128-bit ror_epi64 tests. These call pror.q.128 with a constant i32 5 directly
; on the <2 x i64> argument; the masked forms keep lanes 0-1 of the mask. Each
; expects one `vprorq $5` on xmm registers.
define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  ret <2 x i64> %0
}

declare <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64>, i32)

define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pror.q.128(<2 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

; 256-bit ror_epi64 tests. These call pror.q.256 with a constant i32 5; the
; masked forms keep lanes 0-3 of the mask. Each expects one `vprorq $5` on
; ymm registers.
define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  ret <4 x i64> %0
}

declare <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64>, i32)

define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pror.q.256(<4 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

; 128-bit rorv_epi32 tests. Both <2 x i64> arguments are bitcast to <4 x i32>
; and passed to prorv.d.128; the masked forms select under lanes 0-3 of the
; mask. Each expects one vprorvd on xmm registers.
define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast <2 x i64> %__W to <4 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = tail call <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  ret <2 x i64> %5
}

; 256-bit rorv_epi32 tests. These use prorv.d.256 on <8 x i32> operands; the
; masked forms use the full <8 x i1> mask with no shufflevector. Each expects
; one vprorvd on ymm registers.
define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast <4 x i64> %__W to <8 x i32>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = tail call <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = bitcast <8 x i32> %4 to <4 x i64>
  ret <4 x i64> %5
}

; 128-bit rorv_epi64 tests. prorv.q.128 is called directly on the <2 x i64>
; arguments; the masked forms keep lanes 0-1 of the mask. Each expects one
; vprorvq on xmm registers.
define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

; 256-bit rorv_epi64 tests. These use prorv.q.256; the masked forms keep
; lanes 0-3 of the mask. Each expects one vprorvq on ymm registers.
define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

; Intrinsic declarations. The prolv.* and prorv.* entries at the end are the
; ones called by the variable-rotate tests directly above.
; NOTE(review): the conversion, pmov, vpermi2var, expandload and compressstore
; declarations are presumably referenced by tests earlier in this file; confirm
; before removing any of them.
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
declare <4 x i32> @llvm.x86.avx512.prolv.d.128(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.prolv.d.256(<8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.x86.avx512.prolv.q.128(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.prolv.q.256(<4 x i64>, <4 x i64>)
declare <4 x i32> @llvm.x86.avx512.prorv.d.128(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.prorv.d.256(<8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.x86.avx512.prorv.q.128(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.prorv.q.256(<4 x i64>, <4 x i64>)

; Metadata node !0. NOTE(review): its users are not visible in this part of
; the file.
!0 = !{i32 1}