1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32 3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64 4 5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c 6 7define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { 8; X32-LABEL: test_mm_broadcastd_epi32: 9; X32: # BB#0: 10; X32-NEXT: vpbroadcastd %xmm0, %xmm0 11; X32-NEXT: retl 12; 13; X64-LABEL: test_mm_broadcastd_epi32: 14; X64: # BB#0: 15; X64-NEXT: vpbroadcastd %xmm0, %xmm0 16; X64-NEXT: retq 17 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 18 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer 19 %res1 = bitcast <4 x i32> %res0 to <2 x i64> 20 ret <2 x i64> %res1 21} 22 23define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) { 24; X32-LABEL: test_mm_mask_broadcastd_epi32: 25; X32: # BB#0: 26; X32-NEXT: pushl %eax 27; X32-NEXT: .Ltmp0: 28; X32-NEXT: .cfi_def_cfa_offset 8 29; X32-NEXT: movb {{[0-9]+}}(%esp), %al 30; X32-NEXT: andb $15, %al 31; X32-NEXT: movb %al, (%esp) 32; X32-NEXT: movzbl (%esp), %eax 33; X32-NEXT: kmovw %eax, %k1 34; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} 35; X32-NEXT: popl %eax 36; X32-NEXT: retl 37; 38; X64-LABEL: test_mm_mask_broadcastd_epi32: 39; X64: # BB#0: 40; X64-NEXT: andb $15, %dil 41; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 42; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 43; X64-NEXT: kmovw %eax, %k1 44; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} 45; X64-NEXT: retq 46 %trn1 = trunc i8 %a1 to i4 47 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 48 %arg1 = bitcast i4 %trn1 to <4 x i1> 49 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 50 %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <4 x i32> zeroinitializer 51 %res1 = select 
<4 x i1> %arg1, <4 x i32> %res0, <4 x i32> %arg0 52 %res2 = bitcast <4 x i32> %res1 to <2 x i64> 53 ret <2 x i64> %res2 54} 55 56define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) { 57; X32-LABEL: test_mm_maskz_broadcastd_epi32: 58; X32: # BB#0: 59; X32-NEXT: pushl %eax 60; X32-NEXT: .Ltmp1: 61; X32-NEXT: .cfi_def_cfa_offset 8 62; X32-NEXT: movb {{[0-9]+}}(%esp), %al 63; X32-NEXT: andb $15, %al 64; X32-NEXT: movb %al, (%esp) 65; X32-NEXT: movzbl (%esp), %eax 66; X32-NEXT: kmovw %eax, %k1 67; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} 68; X32-NEXT: popl %eax 69; X32-NEXT: retl 70; 71; X64-LABEL: test_mm_maskz_broadcastd_epi32: 72; X64: # BB#0: 73; X64-NEXT: andb $15, %dil 74; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 75; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 76; X64-NEXT: kmovw %eax, %k1 77; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} 78; X64-NEXT: retq 79 %trn0 = trunc i8 %a0 to i4 80 %arg0 = bitcast i4 %trn0 to <4 x i1> 81 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 82 %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <4 x i32> zeroinitializer 83 %res1 = select <4 x i1> %arg0, <4 x i32> %res0, <4 x i32> zeroinitializer 84 %res2 = bitcast <4 x i32> %res1 to <2 x i64> 85 ret <2 x i64> %res2 86} 87 88define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) { 89; X32-LABEL: test_mm256_broadcastd_epi32: 90; X32: # BB#0: 91; X32-NEXT: vpbroadcastd %xmm0, %ymm0 92; X32-NEXT: retl 93; 94; X64-LABEL: test_mm256_broadcastd_epi32: 95; X64: # BB#0: 96; X64-NEXT: vpbroadcastd %xmm0, %ymm0 97; X64-NEXT: retq 98 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 99 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer 100 %res1 = bitcast <8 x i32> %res0 to <4 x i64> 101 ret <4 x i64> %res1 102} 103 104define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) { 105; X32-LABEL: test_mm256_mask_broadcastd_epi32: 106; X32: # BB#0: 107; X32-NEXT: movb {{[0-9]+}}(%esp), %al 108; X32-NEXT: kmovw %eax, 
%k1 109; X32-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1} 110; X32-NEXT: retl 111; 112; X64-LABEL: test_mm256_mask_broadcastd_epi32: 113; X64: # BB#0: 114; X64-NEXT: kmovw %edi, %k1 115; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1} 116; X64-NEXT: retq 117 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 118 %arg1 = bitcast i8 %a1 to <8 x i1> 119 %arg2 = bitcast <2 x i64> %a2 to <4 x i32> 120 %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer 121 %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0 122 %res2 = bitcast <8 x i32> %res1 to <4 x i64> 123 ret <4 x i64> %res2 124} 125 126define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) { 127; X32-LABEL: test_mm256_maskz_broadcastd_epi32: 128; X32: # BB#0: 129; X32-NEXT: movb {{[0-9]+}}(%esp), %al 130; X32-NEXT: kmovw %eax, %k1 131; X32-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} 132; X32-NEXT: retl 133; 134; X64-LABEL: test_mm256_maskz_broadcastd_epi32: 135; X64: # BB#0: 136; X64-NEXT: kmovw %edi, %k1 137; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} 138; X64-NEXT: retq 139 %arg0 = bitcast i8 %a0 to <8 x i1> 140 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 141 %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer 142 %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer 143 %res2 = bitcast <8 x i32> %res1 to <4 x i64> 144 ret <4 x i64> %res2 145} 146 147define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { 148; X32-LABEL: test_mm_broadcastq_epi64: 149; X32: # BB#0: 150; X32-NEXT: vpbroadcastq %xmm0, %xmm0 151; X32-NEXT: retl 152; 153; X64-LABEL: test_mm_broadcastq_epi64: 154; X64: # BB#0: 155; X64-NEXT: vpbroadcastq %xmm0, %xmm0 156; X64-NEXT: retq 157 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer 158 ret <2 x i64> %res 159} 160 161define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) { 162; X32-LABEL: test_mm_mask_broadcastq_epi64: 163; X32: # BB#0: 164; 
X32-NEXT: pushl %eax 165; X32-NEXT: .Ltmp2: 166; X32-NEXT: .cfi_def_cfa_offset 8 167; X32-NEXT: movb {{[0-9]+}}(%esp), %al 168; X32-NEXT: andb $3, %al 169; X32-NEXT: movb %al, {{[0-9]+}}(%esp) 170; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 171; X32-NEXT: kmovw %eax, %k1 172; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} 173; X32-NEXT: popl %eax 174; X32-NEXT: retl 175; 176; X64-LABEL: test_mm_mask_broadcastq_epi64: 177; X64: # BB#0: 178; X64-NEXT: andb $3, %dil 179; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 180; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 181; X64-NEXT: kmovw %eax, %k1 182; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} 183; X64-NEXT: retq 184 %trn1 = trunc i8 %a1 to i2 185 %arg1 = bitcast i2 %trn1 to <2 x i1> 186 %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <2 x i32> zeroinitializer 187 %res1 = select <2 x i1> %arg1, <2 x i64> %res0, <2 x i64> %a0 188 ret <2 x i64> %res1 189} 190 191define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) { 192; X32-LABEL: test_mm_maskz_broadcastq_epi64: 193; X32: # BB#0: 194; X32-NEXT: pushl %eax 195; X32-NEXT: .Ltmp3: 196; X32-NEXT: .cfi_def_cfa_offset 8 197; X32-NEXT: movb {{[0-9]+}}(%esp), %al 198; X32-NEXT: andb $3, %al 199; X32-NEXT: movb %al, {{[0-9]+}}(%esp) 200; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 201; X32-NEXT: kmovw %eax, %k1 202; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} 203; X32-NEXT: popl %eax 204; X32-NEXT: retl 205; 206; X64-LABEL: test_mm_maskz_broadcastq_epi64: 207; X64: # BB#0: 208; X64-NEXT: andb $3, %dil 209; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 210; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 211; X64-NEXT: kmovw %eax, %k1 212; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} 213; X64-NEXT: retq 214 %trn0 = trunc i8 %a0 to i2 215 %arg0 = bitcast i2 %trn0 to <2 x i1> 216 %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> zeroinitializer 217 %res1 = select <2 x i1> %arg0, <2 x i64> %res0, <2 x i64> zeroinitializer 218 ret <2 x i64> %res1 219} 220 221define <4 x i64> 
@test_mm256_broadcastq_epi64(<2 x i64> %a0) { 222; X32-LABEL: test_mm256_broadcastq_epi64: 223; X32: # BB#0: 224; X32-NEXT: vpbroadcastq %xmm0, %ymm0 225; X32-NEXT: retl 226; 227; X64-LABEL: test_mm256_broadcastq_epi64: 228; X64: # BB#0: 229; X64-NEXT: vpbroadcastq %xmm0, %ymm0 230; X64-NEXT: retq 231 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer 232 ret <4 x i64> %res 233} 234 235define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) { 236; X32-LABEL: test_mm256_mask_broadcastq_epi64: 237; X32: # BB#0: 238; X32-NEXT: pushl %eax 239; X32-NEXT: .Ltmp4: 240; X32-NEXT: .cfi_def_cfa_offset 8 241; X32-NEXT: movb {{[0-9]+}}(%esp), %al 242; X32-NEXT: andb $15, %al 243; X32-NEXT: movb %al, (%esp) 244; X32-NEXT: movzbl (%esp), %eax 245; X32-NEXT: kmovw %eax, %k1 246; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} 247; X32-NEXT: popl %eax 248; X32-NEXT: retl 249; 250; X64-LABEL: test_mm256_mask_broadcastq_epi64: 251; X64: # BB#0: 252; X64-NEXT: andb $15, %dil 253; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 254; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 255; X64-NEXT: kmovw %eax, %k1 256; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} 257; X64-NEXT: retq 258 %trn1 = trunc i8 %a1 to i4 259 %arg1 = bitcast i4 %trn1 to <4 x i1> 260 %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> zeroinitializer 261 %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0 262 ret <4 x i64> %res1 263} 264 265define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) { 266; X32-LABEL: test_mm256_maskz_broadcastq_epi64: 267; X32: # BB#0: 268; X32-NEXT: pushl %eax 269; X32-NEXT: .Ltmp5: 270; X32-NEXT: .cfi_def_cfa_offset 8 271; X32-NEXT: movb {{[0-9]+}}(%esp), %al 272; X32-NEXT: andb $15, %al 273; X32-NEXT: movb %al, (%esp) 274; X32-NEXT: movzbl (%esp), %eax 275; X32-NEXT: kmovw %eax, %k1 276; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} 277; X32-NEXT: popl %eax 278; X32-NEXT: retl 279; 280; X64-LABEL: 
test_mm256_maskz_broadcastq_epi64: 281; X64: # BB#0: 282; X64-NEXT: andb $15, %dil 283; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 284; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 285; X64-NEXT: kmovw %eax, %k1 286; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} 287; X64-NEXT: retq 288 %trn0 = trunc i8 %a0 to i4 289 %arg0 = bitcast i4 %trn0 to <4 x i1> 290 %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <4 x i32> zeroinitializer 291 %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer 292 ret <4 x i64> %res1 293} 294 295define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) { 296; X32-LABEL: test_mm_broadcastsd_pd: 297; X32: # BB#0: 298; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 299; X32-NEXT: retl 300; 301; X64-LABEL: test_mm_broadcastsd_pd: 302; X64: # BB#0: 303; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 304; X64-NEXT: retq 305 %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer 306 ret <2 x double> %res 307} 308 309define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) { 310; X32-LABEL: test_mm_mask_broadcastsd_pd: 311; X32: # BB#0: 312; X32-NEXT: pushl %eax 313; X32-NEXT: .Ltmp6: 314; X32-NEXT: .cfi_def_cfa_offset 8 315; X32-NEXT: movb {{[0-9]+}}(%esp), %al 316; X32-NEXT: andb $3, %al 317; X32-NEXT: movb %al, {{[0-9]+}}(%esp) 318; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 319; X32-NEXT: kmovw %eax, %k1 320; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] 321; X32-NEXT: popl %eax 322; X32-NEXT: retl 323; 324; X64-LABEL: test_mm_mask_broadcastsd_pd: 325; X64: # BB#0: 326; X64-NEXT: andb $3, %dil 327; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 328; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 329; X64-NEXT: kmovw %eax, %k1 330; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] 331; X64-NEXT: retq 332 %trn1 = trunc i8 %a1 to i2 333 %arg1 = bitcast i2 %trn1 to <2 x i1> 334 %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer 335 %res1 = 
select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0 336 ret <2 x double> %res1 337} 338 339define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) { 340; X32-LABEL: test_mm_maskz_broadcastsd_pd: 341; X32: # BB#0: 342; X32-NEXT: pushl %eax 343; X32-NEXT: .Ltmp7: 344; X32-NEXT: .cfi_def_cfa_offset 8 345; X32-NEXT: movb {{[0-9]+}}(%esp), %al 346; X32-NEXT: andb $3, %al 347; X32-NEXT: movb %al, {{[0-9]+}}(%esp) 348; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 349; X32-NEXT: kmovw %eax, %k1 350; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] 351; X32-NEXT: popl %eax 352; X32-NEXT: retl 353; 354; X64-LABEL: test_mm_maskz_broadcastsd_pd: 355; X64: # BB#0: 356; X64-NEXT: andb $3, %dil 357; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 358; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 359; X64-NEXT: kmovw %eax, %k1 360; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] 361; X64-NEXT: retq 362 %trn0 = trunc i8 %a0 to i2 363 %arg0 = bitcast i2 %trn0 to <2 x i1> 364 %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer 365 %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer 366 ret <2 x double> %res1 367} 368 369define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) { 370; X32-LABEL: test_mm256_broadcastsd_pd: 371; X32: # BB#0: 372; X32-NEXT: vbroadcastsd %xmm0, %ymm0 373; X32-NEXT: retl 374; 375; X64-LABEL: test_mm256_broadcastsd_pd: 376; X64: # BB#0: 377; X64-NEXT: vbroadcastsd %xmm0, %ymm0 378; X64-NEXT: retq 379 %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer 380 ret <4 x double> %res 381} 382 383define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) { 384; X32-LABEL: test_mm256_mask_broadcastsd_pd: 385; X32: # BB#0: 386; X32-NEXT: pushl %eax 387; X32-NEXT: .Ltmp8: 388; X32-NEXT: .cfi_def_cfa_offset 8 389; X32-NEXT: movb {{[0-9]+}}(%esp), %al 390; X32-NEXT: andb $15, %al 391; X32-NEXT: movb %al, (%esp) 
392; X32-NEXT: movzbl (%esp), %eax 393; X32-NEXT: kmovw %eax, %k1 394; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} 395; X32-NEXT: popl %eax 396; X32-NEXT: retl 397; 398; X64-LABEL: test_mm256_mask_broadcastsd_pd: 399; X64: # BB#0: 400; X64-NEXT: andb $15, %dil 401; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 402; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 403; X64-NEXT: kmovw %eax, %k1 404; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} 405; X64-NEXT: retq 406 %trn1 = trunc i8 %a1 to i4 407 %arg1 = bitcast i4 %trn1 to <4 x i1> 408 %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <4 x i32> zeroinitializer 409 %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0 410 ret <4 x double> %res1 411} 412 413define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) { 414; X32-LABEL: test_mm256_maskz_broadcastsd_pd: 415; X32: # BB#0: 416; X32-NEXT: pushl %eax 417; X32-NEXT: .Ltmp9: 418; X32-NEXT: .cfi_def_cfa_offset 8 419; X32-NEXT: movb {{[0-9]+}}(%esp), %al 420; X32-NEXT: andb $15, %al 421; X32-NEXT: movb %al, (%esp) 422; X32-NEXT: movzbl (%esp), %eax 423; X32-NEXT: kmovw %eax, %k1 424; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} 425; X32-NEXT: popl %eax 426; X32-NEXT: retl 427; 428; X64-LABEL: test_mm256_maskz_broadcastsd_pd: 429; X64: # BB#0: 430; X64-NEXT: andb $15, %dil 431; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 432; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 433; X64-NEXT: kmovw %eax, %k1 434; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} 435; X64-NEXT: retq 436 %trn0 = trunc i8 %a0 to i4 437 %arg0 = bitcast i4 %trn0 to <4 x i1> 438 %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> zeroinitializer 439 %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer 440 ret <4 x double> %res1 441} 442 443define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { 444; X32-LABEL: test_mm_broadcastss_ps: 445; X32: # BB#0: 446; X32-NEXT: vbroadcastss %xmm0, %xmm0 447; X32-NEXT: retl 448; 449; X64-LABEL: 
test_mm_broadcastss_ps: 450; X64: # BB#0: 451; X64-NEXT: vbroadcastss %xmm0, %xmm0 452; X64-NEXT: retq 453 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer 454 ret <4 x float> %res 455} 456 457define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) { 458; X32-LABEL: test_mm_mask_broadcastss_ps: 459; X32: # BB#0: 460; X32-NEXT: pushl %eax 461; X32-NEXT: .Ltmp10: 462; X32-NEXT: .cfi_def_cfa_offset 8 463; X32-NEXT: movb {{[0-9]+}}(%esp), %al 464; X32-NEXT: andb $15, %al 465; X32-NEXT: movb %al, (%esp) 466; X32-NEXT: movzbl (%esp), %eax 467; X32-NEXT: kmovw %eax, %k1 468; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} 469; X32-NEXT: popl %eax 470; X32-NEXT: retl 471; 472; X64-LABEL: test_mm_mask_broadcastss_ps: 473; X64: # BB#0: 474; X64-NEXT: andb $15, %dil 475; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 476; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 477; X64-NEXT: kmovw %eax, %k1 478; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} 479; X64-NEXT: retq 480 %trn1 = trunc i8 %a1 to i4 481 %arg1 = bitcast i4 %trn1 to <4 x i1> 482 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer 483 %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0 484 ret <4 x float> %res1 485} 486 487define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) { 488; X32-LABEL: test_mm_maskz_broadcastss_ps: 489; X32: # BB#0: 490; X32-NEXT: pushl %eax 491; X32-NEXT: .Ltmp11: 492; X32-NEXT: .cfi_def_cfa_offset 8 493; X32-NEXT: movb {{[0-9]+}}(%esp), %al 494; X32-NEXT: andb $15, %al 495; X32-NEXT: movb %al, (%esp) 496; X32-NEXT: movzbl (%esp), %eax 497; X32-NEXT: kmovw %eax, %k1 498; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} 499; X32-NEXT: popl %eax 500; X32-NEXT: retl 501; 502; X64-LABEL: test_mm_maskz_broadcastss_ps: 503; X64: # BB#0: 504; X64-NEXT: andb $15, %dil 505; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 506; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 507; X64-NEXT: kmovw %eax, %k1 
508; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} 509; X64-NEXT: retq 510 %trn0 = trunc i8 %a0 to i4 511 %arg0 = bitcast i4 %trn0 to <4 x i1> 512 %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer 513 %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer 514 ret <4 x float> %res1 515} 516 517define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) { 518; X32-LABEL: test_mm256_broadcastss_ps: 519; X32: # BB#0: 520; X32-NEXT: vbroadcastss %xmm0, %ymm0 521; X32-NEXT: retl 522; 523; X64-LABEL: test_mm256_broadcastss_ps: 524; X64: # BB#0: 525; X64-NEXT: vbroadcastss %xmm0, %ymm0 526; X64-NEXT: retq 527 %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer 528 ret <8 x float> %res 529} 530 531define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) { 532; X32-LABEL: test_mm256_mask_broadcastss_ps: 533; X32: # BB#0: 534; X32-NEXT: movb {{[0-9]+}}(%esp), %al 535; X32-NEXT: kmovw %eax, %k1 536; X32-NEXT: vbroadcastss %xmm1, %ymm0 {%k1} 537; X32-NEXT: retl 538; 539; X64-LABEL: test_mm256_mask_broadcastss_ps: 540; X64: # BB#0: 541; X64-NEXT: kmovw %edi, %k1 542; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1} 543; X64-NEXT: retq 544 %arg1 = bitcast i8 %a1 to <8 x i1> 545 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer 546 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0 547 ret <8 x float> %res1 548} 549 550define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) { 551; X32-LABEL: test_mm256_maskz_broadcastss_ps: 552; X32: # BB#0: 553; X32-NEXT: movb {{[0-9]+}}(%esp), %al 554; X32-NEXT: kmovw %eax, %k1 555; X32-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} 556; X32-NEXT: retl 557; 558; X64-LABEL: test_mm256_maskz_broadcastss_ps: 559; X64: # BB#0: 560; X64-NEXT: kmovw %edi, %k1 561; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} 562; X64-NEXT: retq 563 %arg0 = bitcast i8 %a0 to <8 x i1> 
564 %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer 565 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer 566 ret <8 x float> %res1 567} 568 569define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) { 570; X32-LABEL: test_mm_movddup_pd: 571; X32: # BB#0: 572; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 573; X32-NEXT: retl 574; 575; X64-LABEL: test_mm_movddup_pd: 576; X64: # BB#0: 577; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 578; X64-NEXT: retq 579 %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer 580 ret <2 x double> %res 581} 582 583define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) { 584; X32-LABEL: test_mm_mask_movddup_pd: 585; X32: # BB#0: 586; X32-NEXT: pushl %eax 587; X32-NEXT: .Ltmp12: 588; X32-NEXT: .cfi_def_cfa_offset 8 589; X32-NEXT: movb {{[0-9]+}}(%esp), %al 590; X32-NEXT: andb $3, %al 591; X32-NEXT: movb %al, {{[0-9]+}}(%esp) 592; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 593; X32-NEXT: kmovw %eax, %k1 594; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] 595; X32-NEXT: popl %eax 596; X32-NEXT: retl 597; 598; X64-LABEL: test_mm_mask_movddup_pd: 599; X64: # BB#0: 600; X64-NEXT: andb $3, %dil 601; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 602; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 603; X64-NEXT: kmovw %eax, %k1 604; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] 605; X64-NEXT: retq 606 %trn1 = trunc i8 %a1 to i2 607 %arg1 = bitcast i2 %trn1 to <2 x i1> 608 %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer 609 %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0 610 ret <2 x double> %res1 611} 612 613define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) { 614; X32-LABEL: test_mm_maskz_movddup_pd: 615; X32: # BB#0: 616; X32-NEXT: pushl %eax 617; X32-NEXT: .Ltmp13: 618; X32-NEXT: .cfi_def_cfa_offset 8 619; X32-NEXT: movb {{[0-9]+}}(%esp), %al 
620; X32-NEXT: andb $3, %al 621; X32-NEXT: movb %al, {{[0-9]+}}(%esp) 622; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 623; X32-NEXT: kmovw %eax, %k1 624; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] 625; X32-NEXT: popl %eax 626; X32-NEXT: retl 627; 628; X64-LABEL: test_mm_maskz_movddup_pd: 629; X64: # BB#0: 630; X64-NEXT: andb $3, %dil 631; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 632; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 633; X64-NEXT: kmovw %eax, %k1 634; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] 635; X64-NEXT: retq 636 %trn1 = trunc i8 %a0 to i2 637 %arg0 = bitcast i2 %trn1 to <2 x i1> 638 %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer 639 %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer 640 ret <2 x double> %res1 641} 642 643define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) { 644; X32-LABEL: test_mm256_movddup_pd: 645; X32: # BB#0: 646; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 647; X32-NEXT: retl 648; 649; X64-LABEL: test_mm256_movddup_pd: 650; X64: # BB#0: 651; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 652; X64-NEXT: retq 653 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 654 ret <4 x double> %res 655} 656 657define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) { 658; X32-LABEL: test_mm256_mask_movddup_pd: 659; X32: # BB#0: 660; X32-NEXT: pushl %eax 661; X32-NEXT: .Ltmp14: 662; X32-NEXT: .cfi_def_cfa_offset 8 663; X32-NEXT: movb {{[0-9]+}}(%esp), %al 664; X32-NEXT: andb $15, %al 665; X32-NEXT: movb %al, (%esp) 666; X32-NEXT: movzbl (%esp), %eax 667; X32-NEXT: kmovw %eax, %k1 668; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2] 669; X32-NEXT: popl %eax 670; X32-NEXT: retl 671; 672; X64-LABEL: test_mm256_mask_movddup_pd: 673; X64: # BB#0: 674; X64-NEXT: andb $15, %dil 675; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 676; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), 
%eax 677; X64-NEXT: kmovw %eax, %k1 678; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2] 679; X64-NEXT: retq 680 %trn1 = trunc i8 %a1 to i4 681 %arg1 = bitcast i4 %trn1 to <4 x i1> 682 %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 683 %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0 684 ret <4 x double> %res1 685} 686 687define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) { 688; X32-LABEL: test_mm256_maskz_movddup_pd: 689; X32: # BB#0: 690; X32-NEXT: pushl %eax 691; X32-NEXT: .Ltmp15: 692; X32-NEXT: .cfi_def_cfa_offset 8 693; X32-NEXT: movb {{[0-9]+}}(%esp), %al 694; X32-NEXT: andb $15, %al 695; X32-NEXT: movb %al, (%esp) 696; X32-NEXT: movzbl (%esp), %eax 697; X32-NEXT: kmovw %eax, %k1 698; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2] 699; X32-NEXT: popl %eax 700; X32-NEXT: retl 701; 702; X64-LABEL: test_mm256_maskz_movddup_pd: 703; X64: # BB#0: 704; X64-NEXT: andb $15, %dil 705; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 706; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 707; X64-NEXT: kmovw %eax, %k1 708; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2] 709; X64-NEXT: retq 710 %trn1 = trunc i8 %a0 to i4 711 %arg0 = bitcast i4 %trn1 to <4 x i1> 712 %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 713 %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer 714 ret <4 x double> %res1 715} 716 717define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) { 718; X32-LABEL: test_mm_movehdup_ps: 719; X32: # BB#0: 720; X32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 721; X32-NEXT: retl 722; 723; X64-LABEL: test_mm_movehdup_ps: 724; X64: # BB#0: 725; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 726; X64-NEXT: retq 727 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 728 ret <4 x float> %res 729} 730 731define <4 x float> 
@test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) { 732; X32-LABEL: test_mm_mask_movehdup_ps: 733; X32: # BB#0: 734; X32-NEXT: pushl %eax 735; X32-NEXT: .Ltmp16: 736; X32-NEXT: .cfi_def_cfa_offset 8 737; X32-NEXT: movb {{[0-9]+}}(%esp), %al 738; X32-NEXT: andb $15, %al 739; X32-NEXT: movb %al, (%esp) 740; X32-NEXT: movzbl (%esp), %eax 741; X32-NEXT: kmovw %eax, %k1 742; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3] 743; X32-NEXT: popl %eax 744; X32-NEXT: retl 745; 746; X64-LABEL: test_mm_mask_movehdup_ps: 747; X64: # BB#0: 748; X64-NEXT: andb $15, %dil 749; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 750; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 751; X64-NEXT: kmovw %eax, %k1 752; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3] 753; X64-NEXT: retq 754 %trn1 = trunc i8 %a1 to i4 755 %arg1 = bitcast i4 %trn1 to <4 x i1> 756 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 757 %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0 758 ret <4 x float> %res1 759} 760 761define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) { 762; X32-LABEL: test_mm_maskz_movehdup_ps: 763; X32: # BB#0: 764; X32-NEXT: pushl %eax 765; X32-NEXT: .Ltmp17: 766; X32-NEXT: .cfi_def_cfa_offset 8 767; X32-NEXT: movb {{[0-9]+}}(%esp), %al 768; X32-NEXT: andb $15, %al 769; X32-NEXT: movb %al, (%esp) 770; X32-NEXT: movzbl (%esp), %eax 771; X32-NEXT: kmovw %eax, %k1 772; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] 773; X32-NEXT: popl %eax 774; X32-NEXT: retl 775; 776; X64-LABEL: test_mm_maskz_movehdup_ps: 777; X64: # BB#0: 778; X64-NEXT: andb $15, %dil 779; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 780; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 781; X64-NEXT: kmovw %eax, %k1 782; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] 783; X64-NEXT: retq 784 %trn0 = trunc i8 %a0 to i4 785 %arg0 = bitcast i4 %trn0 to <4 x i1> 786 %res0 = shufflevector <4 x float> %a1, <4 x float> 
undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 787 %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer 788 ret <4 x float> %res1 789} 790 791define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) { 792; X32-LABEL: test_mm256_movehdup_ps: 793; X32: # BB#0: 794; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] 795; X32-NEXT: retl 796; 797; X64-LABEL: test_mm256_movehdup_ps: 798; X64: # BB#0: 799; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] 800; X64-NEXT: retq 801 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 802 ret <8 x float> %res 803} 804 805define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) { 806; X32-LABEL: test_mm256_mask_movehdup_ps: 807; X32: # BB#0: 808; X32-NEXT: movb {{[0-9]+}}(%esp), %al 809; X32-NEXT: kmovw %eax, %k1 810; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7] 811; X32-NEXT: retl 812; 813; X64-LABEL: test_mm256_mask_movehdup_ps: 814; X64: # BB#0: 815; X64-NEXT: kmovw %edi, %k1 816; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7] 817; X64-NEXT: retq 818 %arg1 = bitcast i8 %a1 to <8 x i1> 819 %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 820 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0 821 ret <8 x float> %res1 822} 823 824define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) { 825; X32-LABEL: test_mm256_maskz_movehdup_ps: 826; X32: # BB#0: 827; X32-NEXT: movb {{[0-9]+}}(%esp), %al 828; X32-NEXT: kmovw %eax, %k1 829; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] 830; X32-NEXT: retl 831; 832; X64-LABEL: test_mm256_maskz_movehdup_ps: 833; X64: # BB#0: 834; X64-NEXT: kmovw %edi, %k1 835; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] 836; X64-NEXT: retq 837 %arg0 = bitcast i8 %a0 to <8 x i1> 
838 %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 839 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer 840 ret <8 x float> %res1 841} 842 843define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) { 844; X32-LABEL: test_mm_moveldup_ps: 845; X32: # BB#0: 846; X32-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 847; X32-NEXT: retl 848; 849; X64-LABEL: test_mm_moveldup_ps: 850; X64: # BB#0: 851; X64-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 852; X64-NEXT: retq 853 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 854 ret <4 x float> %res 855} 856 857define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) { 858; X32-LABEL: test_mm_mask_moveldup_ps: 859; X32: # BB#0: 860; X32-NEXT: pushl %eax 861; X32-NEXT: .Ltmp18: 862; X32-NEXT: .cfi_def_cfa_offset 8 863; X32-NEXT: movb {{[0-9]+}}(%esp), %al 864; X32-NEXT: andb $15, %al 865; X32-NEXT: movb %al, (%esp) 866; X32-NEXT: movzbl (%esp), %eax 867; X32-NEXT: kmovw %eax, %k1 868; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] 869; X32-NEXT: popl %eax 870; X32-NEXT: retl 871; 872; X64-LABEL: test_mm_mask_moveldup_ps: 873; X64: # BB#0: 874; X64-NEXT: andb $15, %dil 875; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) 876; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax 877; X64-NEXT: kmovw %eax, %k1 878; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] 879; X64-NEXT: retq 880 %trn1 = trunc i8 %a1 to i4 881 %arg1 = bitcast i4 %trn1 to <4 x i1> 882 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 883 %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0 884 ret <4 x float> %res1 885} 886 887define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) { 888; X32-LABEL: test_mm_maskz_moveldup_ps: 889; X32: # BB#0: 890; X32-NEXT: pushl %eax 891; X32-NEXT: .Ltmp19: 892; X32-NEXT: 
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

; Unmasked _mm256_moveldup_ps: duplicate the even-index float of each pair
; (shuffle mask 0,0,2,2,4,4,6,6), lowered to a single vmovsldup.
define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

; Write-masked variant: the i8 mask becomes <8 x i1> via bitcast and the
; select merges the shuffle of %a2 with the passthru %a0 (vmovsldup {%k1}).
define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

; Zero-masked variant: masked-off lanes select zeroinitializer ({%k1} {z}).
define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

; _mm256_permutex_epi64 with imm 0x03: lane permutation <3,0,0,0> → vpermq.
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %res
}

; Masked 4-lane case: only the low 4 mask bits are used — the i8 is truncated
; to i4 before the bitcast, lowered as andb $15 + spill/reload + kmovw.
define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_mask_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp20:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp21:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

; Same permutation pattern as the epi64 tests but on <4 x double> → vpermpd.
define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp22:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp23:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

; Two-source <2 x double> shuffle <1,3> (high element of each source),
; recognized and lowered as vunpckhpd.
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}

; 2-lane masked case: only the low 2 mask bits are used (trunc i8 → i2,
; lowered as andb $3 + kmovw).
define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp24:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> %a3, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp25:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

; 256-bit two-source pd shuffle <1,5,2,6> → vshufpd (per-128-bit-lane pattern).
define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp26:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> %a3, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp27:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> %a2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

; Two-source <4 x float> shuffle <0,1,4,4> → vshufps.
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp28:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> %a3, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp29:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

; 256-bit ps shuffle: same <0,1,imm,imm> pattern repeated in both 128-bit
; lanes (mask <0,1,8,8,4,5,12,12>) → single vshufps.
define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %res
}

; 8-lane masked case: all 8 bits of the i8 mask are used, so no trunc/andb —
; the mask moves straight into %k1 via kmovw.
define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

!0 = !{i32 1}