; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64

@g16 = external global i16

; Inserting a scalar i32 into lane 1 should select pinsrd (from the stack slot
; on i386, from %edi on x86-64).
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrd $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

; Inserting a scalar i8 into lane 1 should select pinsrb.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
; X32:       ## BB#0:
; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: pinsrb_1:
; X64:       ## BB#0:
; X64-NEXT:    pinsrb $1, %edi, %xmm0
; X64-NEXT:    retq
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

; A scalar i16 load feeding the pmovzxbq intrinsic should fold into a
; memory-operand pmovzxbq of @g16.
define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
; X32-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: pmovzxbq_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
; X64-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
entry:
  %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
  %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
  %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
  %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %3
}

declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

; Extract of lane 3 bitcast to i32 -> extractps into a GPR.
define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_1:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}

; Same as extractps_1 but with the bitcast done on the vector instead.
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
; X32:       ## BB#0:
; X32-NEXT:    extractps $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: extractps_2:
; X64:       ## BB#0:
; X64-NEXT:    extractps $3, %xmm0, %eax
; X64-NEXT:    retq
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcasted to i32, but unsuitable for much of anything else.

define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    addss LCPI5_0, %xmm0
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_1:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}

define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
; X32:       ## BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_2:
; X64:       ## BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT:    retq
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}

; Integer lane extract -> pextrd straight into a GPR.
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
; X32:       ## BB#0:
; X32-NEXT:    pextrd $3, %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ext_3:
; X64:       ## BB#0:
; X64-NEXT:    pextrd $3, %xmm0, %eax
; X64-NEXT:    retq
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

; Direct use of the insertps intrinsic with an immediate that zeroes lane 0.
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
  ret <4 x float> %tmp1
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32:       ## BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_1:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X32 but not for X64!
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_or_blendps:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32:       ## BB#0:
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64:       ## BB#0:
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}

; ptestz intrinsic -> ptest + sete/movzbl materializing the ZF result.
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sete %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_1:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptestc intrinsic -> ptest + sbbl/andl materializing the CF result.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_2:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; ptestnzc intrinsic -> ptest + seta/movzbl materializing the "above" result.
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32:       ## BB#0:
; X32-NEXT:    ptest %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: ptestz_3:
; X64:       ## BB#0:
; X64-NEXT:    ptest %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}


declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32-LABEL: buildvector:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    addss %xmm2, %xmm3
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: buildvector:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    addss %xmm2, %xmm3
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT:    retq
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

; Shuffle taking lane 3 from a loaded vector -> load-folding insertps.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    retq
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

; A scalar float load inserted via insertelement+shuffle folds into insertps.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT:    retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64:       ## BB#0:
; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X64-NEXT:    retq
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}

;;;;;; Shuffles optimizable with a single insertps or blend instruction
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT:    retl
;
; X64-LABEL: shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm2, %xmm2
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}

; Integer variants of the shuffle patterns above.
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm1, %xmm1
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYZ0:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
; X32:       ## BB#0:
; X32-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XY00:
; X64:       ## BB#0:
; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYY0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_XYW0:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_W00W:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_W00W:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 3
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00A:
; X32:       ## BB#0:
; X32-NEXT:    pxor %xmm2, %xmm2
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00A:
; X64:       ## BB#0:
; X64-NEXT:    pxor %xmm2, %xmm2
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32:       ## BB#0:
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X32-NEXT:    pxor %xmm0, %xmm0
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X00X:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; X64-NEXT:    pxor %xmm0, %xmm0
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
; X32:       ## BB#0:
; X32-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64:       ## BB#0:
; X64-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}

;; Test for a bug in the first implementation of LowerBuildVectorv4x32
define <4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
; X32:       ## BB#0:
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_insertps_no_undef:
; X64:       ## BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
}

; A <8 x i1> select has no direct instruction; it lowers via sign-extended
; mask + pblendvb.
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
; X32:       ## BB#0:
; X32-NEXT:    psllw $15, %xmm0
; X32-NEXT:    psraw $15, %xmm0
; X32-NEXT:    pblendvb %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: blendvb_fallback:
; X64:       ## BB#0:
; X64-NEXT:    psllw $15, %xmm0
; X64-NEXT:    psraw $15, %xmm0
; X64-NEXT:    pblendvb %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm0
; X64-NEXT:    retq
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}

; On X32, account for the argument's move to registers
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load:
; X64:       ## BB#0:
; X64-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64:       ## BB#0:
; X64-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}

;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X32-LABEL: insertps_from_vector_load_offset_2:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    shll $4, %ecx
; X32-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64:       ## BB#0:
; X64-NEXT:    shlq $4, %rsi
; X64-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}

define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_loadf32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64:       ## BB#0:
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups (%eax), %xmm1
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64:       ## BB#0:
; X64-NEXT:    movups (%rdi), %xmm1
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
900define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) { 901; X32-LABEL: insertps_from_broadcast_multiple_use: 902; X32: ## BB#0: 903; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 904; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx 905; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero 906; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] 907; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] 908; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] 909; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] 910; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] 911; X32-NEXT: addps %xmm1, %xmm0 912; X32-NEXT: addps %xmm2, %xmm3 913; X32-NEXT: addps %xmm3, %xmm0 914; X32-NEXT: retl 915; 916; X64-LABEL: insertps_from_broadcast_multiple_use: 917; X64: ## BB#0: 918; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero 919; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] 920; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] 921; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] 922; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] 923; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] 924; X64-NEXT: addps %xmm1, %xmm0 925; X64-NEXT: addps %xmm2, %xmm3 926; X64-NEXT: addps %xmm3, %xmm0 927; X64-NEXT: retq 928 %1 = getelementptr inbounds float, float* %fb, i64 %index 929 %2 = load float, float* %1, align 4 930 %3 = insertelement <4 x float> undef, float %2, i32 0 931 %4 = insertelement <4 x float> %3, float %2, i32 1 932 %5 = insertelement <4 x float> %4, float %2, i32 2 933 %6 = insertelement <4 x float> %5, float %2, i32 3 934 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 935 %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48) 936 %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48) 937 %10 = tail call <4 x float> 
@llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48) 938 %11 = fadd <4 x float> %7, %8 939 %12 = fadd <4 x float> %9, %10 940 %13 = fadd <4 x float> %11, %12 941 ret <4 x float> %13 942} 943 944define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { 945; X32-LABEL: insertps_with_undefs: 946; X32: ## BB#0: 947; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 948; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 949; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 950; X32-NEXT: movapd %xmm1, %xmm0 951; X32-NEXT: retl 952; 953; X64-LABEL: insertps_with_undefs: 954; X64: ## BB#0: 955; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 956; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 957; X64-NEXT: movapd %xmm1, %xmm0 958; X64-NEXT: retq 959 %1 = load float, float* %b, align 4 960 %2 = insertelement <4 x float> undef, float %1, i32 0 961 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7> 962 ret <4 x float> %result 963} 964 965; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using 966; the destination index to change the load, instead of the source index. 
967define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) { 968; X32-LABEL: pr20087: 969; X32: ## BB#0: 970; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 971; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0] 972; X32-NEXT: retl 973; 974; X64-LABEL: pr20087: 975; X64: ## BB#0: 976; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0] 977; X64-NEXT: retq 978 %load = load <4 x float> , <4 x float> *%ptr 979 %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2> 980 ret <4 x float> %ret 981} 982 983; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1> 984define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 { 985; X32-LABEL: insertps_pr20411: 986; X32: ## BB#0: 987; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 988; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 989; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 990; X32-NEXT: movdqu %xmm1, (%eax) 991; X32-NEXT: retl 992; 993; X64-LABEL: insertps_pr20411: 994; X64: ## BB#0: 995; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 996; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 997; X64-NEXT: movdqu %xmm1, (%rdi) 998; X64-NEXT: retq 999 %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef> 1000 %ptrcast = bitcast i32* %RET to <4 x i32>* 1001 store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 1002 ret void 1003} 1004 1005define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) { 1006; X32-LABEL: insertps_4: 1007; X32: ## BB#0: ## %entry 1008; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero 1009; X32-NEXT: retl 1010; 1011; X64-LABEL: insertps_4: 1012; X64: ## BB#0: ## %entry 1013; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero 1014; X64-NEXT: retq 1015entry: 1016 %vecext = extractelement <4 x float> %A, i32 0 1017 %vecinit = 
insertelement <4 x float> undef, float %vecext, i32 0 1018 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 1019 %vecext2 = extractelement <4 x float> %B, i32 2 1020 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 1021 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1022 ret <4 x float> %vecinit4 1023} 1024 1025define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) { 1026; X32-LABEL: insertps_5: 1027; X32: ## BB#0: ## %entry 1028; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero 1029; X32-NEXT: retl 1030; 1031; X64-LABEL: insertps_5: 1032; X64: ## BB#0: ## %entry 1033; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero 1034; X64-NEXT: retq 1035entry: 1036 %vecext = extractelement <4 x float> %A, i32 0 1037 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1038 %vecext1 = extractelement <4 x float> %B, i32 1 1039 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 1040 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 1041 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1042 ret <4 x float> %vecinit4 1043} 1044 1045define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) { 1046; X32-LABEL: insertps_6: 1047; X32: ## BB#0: ## %entry 1048; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero 1049; X32-NEXT: retl 1050; 1051; X64-LABEL: insertps_6: 1052; X64: ## BB#0: ## %entry 1053; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero 1054; X64-NEXT: retq 1055entry: 1056 %vecext = extractelement <4 x float> %A, i32 1 1057 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 1058 %vecext1 = extractelement <4 x float> %B, i32 2 1059 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 1060 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 1061 ret <4 x 
float> %vecinit3 1062} 1063 1064define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) { 1065; X32-LABEL: insertps_7: 1066; X32: ## BB#0: ## %entry 1067; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero 1068; X32-NEXT: retl 1069; 1070; X64-LABEL: insertps_7: 1071; X64: ## BB#0: ## %entry 1072; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero 1073; X64-NEXT: retq 1074entry: 1075 %vecext = extractelement <4 x float> %A, i32 0 1076 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1077 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 1078 %vecext2 = extractelement <4 x float> %B, i32 1 1079 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 1080 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1081 ret <4 x float> %vecinit4 1082} 1083 1084define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) { 1085; X32-LABEL: insertps_8: 1086; X32: ## BB#0: ## %entry 1087; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero 1088; X32-NEXT: retl 1089; 1090; X64-LABEL: insertps_8: 1091; X64: ## BB#0: ## %entry 1092; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero 1093; X64-NEXT: retq 1094entry: 1095 %vecext = extractelement <4 x float> %A, i32 0 1096 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1097 %vecext1 = extractelement <4 x float> %B, i32 0 1098 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 1099 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 1100 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1101 ret <4 x float> %vecinit4 1102} 1103 1104define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) { 1105; X32-LABEL: insertps_9: 1106; X32: ## BB#0: ## %entry 1107; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero 1108; X32-NEXT: movaps %xmm1, %xmm0 1109; X32-NEXT: retl 1110; 1111; X64-LABEL: insertps_9: 1112; X64: ## BB#0: 
## %entry 1113; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero 1114; X64-NEXT: movaps %xmm1, %xmm0 1115; X64-NEXT: retq 1116entry: 1117 %vecext = extractelement <4 x float> %A, i32 0 1118 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 1119 %vecext1 = extractelement <4 x float> %B, i32 2 1120 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 1121 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 1122 ret <4 x float> %vecinit3 1123} 1124 1125define <4 x float> @insertps_10(<4 x float> %A) 1126; X32-LABEL: insertps_10: 1127; X32: ## BB#0: 1128; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero 1129; X32-NEXT: retl 1130; 1131; X64-LABEL: insertps_10: 1132; X64: ## BB#0: 1133; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero 1134; X64-NEXT: retq 1135{ 1136 %vecext = extractelement <4 x float> %A, i32 0 1137 %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0 1138 %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2 1139 ret <4 x float> %vecbuild2 1140} 1141 1142define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) { 1143; X32-LABEL: build_vector_to_shuffle_1: 1144; X32: ## BB#0: ## %entry 1145; X32-NEXT: xorps %xmm1, %xmm1 1146; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1147; X32-NEXT: retl 1148; 1149; X64-LABEL: build_vector_to_shuffle_1: 1150; X64: ## BB#0: ## %entry 1151; X64-NEXT: xorps %xmm1, %xmm1 1152; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1153; X64-NEXT: retq 1154entry: 1155 %vecext = extractelement <4 x float> %A, i32 1 1156 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 1157 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 1158 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x 
i32> <i32 0, i32 1, i32 2, i32 7> 1159 ret <4 x float> %vecinit3 1160} 1161 1162define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) { 1163; X32-LABEL: build_vector_to_shuffle_2: 1164; X32: ## BB#0: ## %entry 1165; X32-NEXT: xorps %xmm1, %xmm1 1166; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1167; X32-NEXT: retl 1168; 1169; X64-LABEL: build_vector_to_shuffle_2: 1170; X64: ## BB#0: ## %entry 1171; X64-NEXT: xorps %xmm1, %xmm1 1172; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1173; X64-NEXT: retq 1174entry: 1175 %vecext = extractelement <4 x float> %A, i32 1 1176 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 1177 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 1178 ret <4 x float> %vecinit1 1179} 1180