; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

define <16 x i8> @load8_ins_elt0_v16i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_elt0_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_elt0_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_elt0_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <16 x i8> undef, i8 %x, i32 0
  ret <16 x i8> %ins
}

define <8 x i16> @load16_ins_elt0_v8i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_elt0_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_elt0_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_elt0_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <8 x i16> undef, i16 %x, i32 0
  ret <8 x i16> %ins
}

define <4 x i32> @load32_ins_elt0_v4i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
  ret <4 x i32> %ins
}

define <2 x i64> @load64_ins_elt0_v2i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
  ret <2 x i64> %ins
}

define <4 x float> @load32_ins_elt0_v4f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <4 x float> undef, float %x, i32 0
  ret <4 x float> %ins
}

define <2 x double> @load64_ins_elt0_v2f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <2 x double> undef, double %x, i32 0
  ret <2 x double> %ins
}

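; The same scalar loads inserted into a nonzero constant element. All other
; lanes are undef, so a broadcast of the loaded value is also a valid
; lowering wherever one is available; without a broadcast, the value is
; shifted/shuffled into the requested lane.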
define <16 x i8> @load8_ins_eltc_v16i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_eltc_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pslld $24, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_eltc_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_eltc_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <16 x i8> undef, i8 %x, i32 3
  ret <16 x i8> %ins
}

define <8 x i16> @load16_ins_eltc_v8i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_eltc_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_eltc_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_eltc_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <8 x i16> undef, i16 %x, i32 5
  ret <8 x i16> %ins
}

define <4 x i32> @load32_ins_eltc_v4i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <4 x i32> undef, i32 %x, i32 2
  ret <4 x i32> %ins
}

define <2 x i64> @load64_ins_eltc_v2i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
  ret <2 x i64> %ins
}

define <4 x float> @load32_ins_eltc_v4f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <4 x float> undef, float %x, i32 3
  ret <4 x float> %ins
}

define <2 x double> @load64_ins_eltc_v2f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <2 x double> undef, double %x, i32 1
  ret <2 x double> %ins
}

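; 256-bit vectors, element 0: only the low 128 bits are defined, so the
; SSE/AVX1 lowering matches the 128-bit cases; AVX2 broadcasts to the full
; ymm register.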
define <32 x i8> @load8_ins_elt0_v32i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_elt0_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_elt0_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_elt0_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <32 x i8> undef, i8 %x, i32 0
  ret <32 x i8> %ins
}

define <16 x i16> @load16_ins_elt0_v16i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_elt0_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_elt0_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_elt0_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <16 x i16> undef, i16 %x, i32 0
  ret <16 x i16> %ins
}

define <8 x i32> @load32_ins_elt0_v8i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v8i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> undef, i32 %x, i32 0
  ret <8 x i32> %ins
}

define <4 x i64> @load64_ins_elt0_v4i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v4i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <4 x i64> undef, i64 %x, i32 0
  ret <4 x i64> %ins
}

define <8 x float> @load32_ins_elt0_v8f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <8 x float> undef, float %x, i32 0
  ret <8 x float> %ins
}

define <4 x double> @load64_ins_elt0_v4f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <4 x double> undef, double %x, i32 0
  ret <4 x double> %ins
}

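; 256-bit vectors, nonzero element: AVX/AVX2 broadcast to the full ymm;
; SSE/AVX1 materialize the value in whichever half (xmm0/xmm1 for SSE,
; via vinsertf128 for AVX1) covers the requested, otherwise-undef element.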
define <32 x i8> @load8_ins_eltc_v32i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_eltc_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    psllq $40, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_eltc_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpsllq $40, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_eltc_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <32 x i8> undef, i8 %x, i32 21
  ret <32 x i8> %ins
}

define <16 x i16> @load16_ins_eltc_v16i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_eltc_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    psllq $48, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_eltc_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_eltc_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <16 x i16> undef, i16 %x, i32 11
  ret <16 x i16> %ins
}

define <8 x i32> @load32_ins_eltc_v8i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v8i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> undef, i32 %x, i32 7
  ret <8 x i32> %ins
}

define <4 x i64> @load64_ins_eltc_v4i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v4i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <4 x i64> undef, i64 %x, i32 3
  ret <4 x i64> %ins
}

define <8 x float> @load32_ins_eltc_v8f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <8 x float> undef, float %x, i32 5
  ret <8 x float> %ins
}

define <4 x double> @load64_ins_eltc_v4f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <4 x double> undef, double %x, i32 3
  ret <4 x double> %ins
}