; Regression tests for SSE4.1 instruction selection (pinsrd/pinsrb, pmovzxbq,
; extractps, insertps, blendps, ptest), checked on both i386 and x86-64 Darwin.
; The X32/X64 check lines are autogenerated expected llc output — do not edit
; them by hand; regenerate them instead.
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64

@g16 = external global i16

define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
; X32: ## BB#0:
; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: pinsrd_1:
; X64: ## BB#0:
; X64-NEXT: pinsrd $1, %edi, %xmm0
; X64-NEXT: retq
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
; X32: ## BB#0:
; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: pinsrb_1:
; X64: ## BB#0:
; X64-NEXT: pinsrb $1, %edi, %xmm0
; X64-NEXT: retq
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl L_g16$non_lazy_ptr, %eax
; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: pmovzxbq_1:
; X64: ## BB#0: ## %entry
; X64-NEXT: movq _g16@{{.*}}(%rip), %rax
; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
entry:
  %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
  %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
  %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
  %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %3
}

declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
; X32: ## BB#0:
; X32-NEXT: extractps $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: extractps_1:
; X64: ## BB#0:
; X64-NEXT: extractps $3, %xmm0, %eax
; X64-NEXT: retq
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
; X32: ## BB#0:
; X32-NEXT: extractps $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: extractps_2:
; X64: ## BB#0:
; X64-NEXT: extractps $3, %xmm0, %eax
; X64-NEXT: retq
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcasted to i32, but unsuitable for much of anything else.

define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
; X32: ## BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT: addss LCPI5_0, %xmm0
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: ext_1:
; X64: ## BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: addss {{.*}}(%rip), %xmm0
; X64-NEXT: retq
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}
define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
; X32: ## BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: ext_2:
; X64: ## BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: retq
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
; X32: ## BB#0:
; X32-NEXT: pextrd $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: ext_3:
; X64: ## BB#0:
; X64-NEXT: pextrd $3, %xmm0, %eax
; X64-NEXT: retq
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_1:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X64-NEXT: retq
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
  ret <4 x float> %tmp1
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32: ## BB#0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: blendps_not_insertps_1:
; X64: ## BB#0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X32 but not for X64!
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_or_blendps:
; X64: ## BB#0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32: ## BB#0:
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64: ## BB#0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}

define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32: ## BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: retl
;
; X64-LABEL: ptestz_1:
; X64: ## BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sete %al
; X64-NEXT: retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; Despite the name, this exercises the ptestc (carry-flag) intrinsic.
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
; X32: ## BB#0:
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sbbl %eax, %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_2:
; X64: ## BB#0:
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sbbl %eax, %eax
; X64-NEXT: andl $1, %eax
; X64-NEXT: retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

; Despite the name, this exercises the ptestnzc intrinsic.
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32: ## BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: ptestz_3:
; X64: ## BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: seta %al
; X64-NEXT: retq
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}


declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32-LABEL: buildvector:
; X32: ## BB#0: ## %entry
; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: addss %xmm2, %xmm3
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: buildvector:
; X64: ## BB#0: ## %entry
; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: addss %xmm2, %xmm3
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT: retq
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_1:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
; X32: ## BB#0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_2:
; X64: ## BB#0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
; X64: ## BB#0: ## %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}

define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
; X32: ## BB#0: ## %entry
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
; X64: ## BB#0: ## %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT: retq
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64: ## BB#0:
; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; X64-NEXT: retq
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}

;;;;;; Shuffles optimizable with a single insertps or blend instruction
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
; X32: ## BB#0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYZ0:
; X64: ## BB#0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
; X32: ## BB#0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XY00:
; X64: ## BB#0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYY0:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYW0:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: shuf_W00W:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00A:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00X:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X0YC:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}

define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
; X32: ## BB#0:
; X32-NEXT: pxor %xmm1, %xmm1
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYZ0:
; X64: ## BB#0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
; X32: ## BB#0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XY00:
; X64: ## BB#0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
; X32: ## BB#0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYY0:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
; X32: ## BB#0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYW0:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_W00W:
; X32: ## BB#0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_W00W:
; X64: ## BB#0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 3
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00A:
; X32: ## BB#0:
; X32-NEXT: pxor %xmm2, %xmm2
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00A:
; X64: ## BB#0:
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32: ## BB#0:
; X32-NEXT: pxor %xmm1, %xmm1
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00X:
; X64: ## BB#0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
; X32: ## BB#0:
; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X0YC:
; X64: ## BB#0:
; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x i32> %vecinit5
}

;; Test for a bug in the first implementation of LowerBuildVectorv4x32
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
; X32: ## BB#0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT: maxps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_insertps_no_undef:
; X64: ## BB#0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: maxps %xmm1, %xmm0
; X64-NEXT: retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
  ret <4 x float> %res
}

define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
; X32: ## BB#0:
; X32-NEXT: psllw $15, %xmm0
; X32-NEXT: psraw $15, %xmm0
; X32-NEXT: pblendvb %xmm1, %xmm2
; X32-NEXT: movdqa %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: blendvb_fallback:
; X64: ## BB#0:
; X64-NEXT: psllw $15, %xmm0
; X64-NEXT: psraw $15, %xmm0
; X64-NEXT: pblendvb %xmm1, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm0
; X64-NEXT: retq
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}

; On X32, account for the argument's move to registers
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load:
; X64: ## BB#0:
; X64-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64: ## BB#0:
; X64-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}

;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X32-LABEL: insertps_from_vector_load_offset_2:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: shll $4, %ecx
; X32-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64: ## BB#0:
; X64-NEXT: shlq $4, %rsi
; X64-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}

define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_loadf32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

; NOTE(review): the body of this function is truncated in the available chunk
; (it ends mid-statement below); the remainder lives past this view.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: addps %xmm2, %xmm3
; X32-NEXT: addps %xmm3, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64: ## BB#0:
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: addps %xmm2, %xmm3
; X64-NEXT: addps %xmm3, %xmm0
; X64-NEXT: retq
  %1
= getelementptr inbounds float, float* %fb, i64 %index 914 %2 = load float, float* %1, align 4 915 %3 = insertelement <4 x float> undef, float %2, i32 0 916 %4 = insertelement <4 x float> %3, float %2, i32 1 917 %5 = insertelement <4 x float> %4, float %2, i32 2 918 %6 = insertelement <4 x float> %5, float %2, i32 3 919 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 920 %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48) 921 %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48) 922 %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48) 923 %11 = fadd <4 x float> %7, %8 924 %12 = fadd <4 x float> %9, %10 925 %13 = fadd <4 x float> %11, %12 926 ret <4 x float> %13 927} 928 929define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { 930; X32-LABEL: insertps_with_undefs: 931; X32: ## BB#0: 932; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 933; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 934; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 935; X32-NEXT: movapd %xmm1, %xmm0 936; X32-NEXT: retl 937; 938; X64-LABEL: insertps_with_undefs: 939; X64: ## BB#0: 940; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 941; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 942; X64-NEXT: movapd %xmm1, %xmm0 943; X64-NEXT: retq 944 %1 = load float, float* %b, align 4 945 %2 = insertelement <4 x float> undef, float %1, i32 0 946 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7> 947 ret <4 x float> %result 948} 949 950; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using 951; the destination index to change the load, instead of the source index. 
; Shuffle mask <4, undef, 6, 2>: lane 3 comes from the loaded vector's element
; 2, so the folded insertps must read from the *source* element's offset (see
; the getINSERTPS bug note above this function).
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; X32-LABEL: pr20087:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X32-NEXT:    retl
;
; X64-LABEL: pr20087:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X64-NEXT:    retq
  %load = load <4 x float> , <4 x float> *%ptr
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
}

; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
; Expected lowering is pshufd+pblendw; the result is stored unaligned (movdqu)
; since %RET is only known to be i32-aligned.
define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
; X32-LABEL: insertps_pr20411:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT:    movdqu %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: insertps_pr20411:
; X64:       ## BB#0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT:    movdqu %xmm1, (%rdi)
; X64-NEXT:    retq
  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
  %ptrcast = bitcast i32* %RET to <4 x i32>*
  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
  ret void
}

; Build vector {A[0], 0.0, B[2], 0.0}: should become one insertps using its
; zero mask for lanes 1 and 3.
define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_4:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_4:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build vector {A[0], B[1], 0.0, 0.0}: one insertps, zeroing lanes 2 and 3.
define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_5:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_5:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build vector {0.0, A[1], B[2], 0.0}: one insertps, zeroing lanes 0 and 3.
define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_6:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_6:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

; Build vector {A[0], 0.0, B[1], 0.0}: one insertps, zeroing lanes 1 and 3.
define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_7:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_7:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build vector {A[0], B[0], 0.0, 0.0}: one insertps, zeroing lanes 2 and 3.
define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_8:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_8:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Build vector {0.0, A[0], B[2], 0.0}: here insertps writes into B's register
; (xmm1), so an extra movaps copies the result back to the return register.
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_9:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: insertps_9:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

; Single-source case {A[0], 0.0, A[0], 0.0}: one insertps duplicating A[0]
; into lanes 0 and 2 with zero mask for lanes 1 and 3.
define <4 x float> @insertps_10(<4 x float> %A)
; X32-LABEL: insertps_10:
; X32:       ## BB#0:
; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X32-NEXT:    retl
;
; X64-LABEL: insertps_10:
; X64:       ## BB#0:
; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X64-NEXT:    retq
{
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
}

; insertelement-into-zero chain followed by a shuffle pulling A[3] back in:
; should be recognized as a blend of A with a zeroed register.
define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
}

; Build vector {0.0, A[1], 0.0, (undef)}: only lane 1 survives from A, so
; this should also lower to a blend against a zeroed register.
define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: build_vector_to_shuffle_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X64-NEXT:    retq
entry:
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
}