; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
; SSE2-LABEL: mul8c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul8c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm3, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul8c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
entry:
  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <16 x i8> %A
}

define <8 x i16> @mul16c(<8 x i16> %i) nounwind {
; SSE-LABEL: mul16c:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: mul16c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <8 x i16> %A
}

define <4 x i32> @a(<4 x i32> %i) nounwind {
; SSE2-LABEL: a:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: a:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: a:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

define <2 x i64> @b(<2 x i64> %i) nounwind {
; SSE-LABEL: b:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: b:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [117,117]
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-LABEL: mul8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul8:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm3
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm3, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
entry:
  %A = mul <16 x i8> %i, %j
  ret <16 x i8> %A
}

define <8 x i16> @mul16(<8 x i16> %i, <8 x i16> %j) nounwind {
; SSE-LABEL: mul16:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: mul16:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <8 x i16> %i, %j
  ret <8 x i16> %A
}

define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: d:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    paddq %xmm3, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: d:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX2-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX2-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

declare void @foo()

define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: e:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    callq foo
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: e:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    subq $40, %rsp
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE41-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT:    callq foo
; SSE41-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT:    pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT:    addq $40, %rsp
; SSE41-NEXT:    retq
;
; AVX2-LABEL: e:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    callq foo
; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: f:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    subq $40, %rsp
; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    callq foo
; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT:    pmuludq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm0, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm1, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm3, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    addq $40, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: f:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    callq foo
; AVX2-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX2-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpsrlq $32, %xmm2, %xmm1
; AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $32, %xmm3, %xmm1
; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

define <4 x i64> @b1(<4 x i64> %i) nounwind {
; SSE-LABEL: b1:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: b1:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
  ret <4 x i64> %A
}

define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-LABEL: b2:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm4
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm0, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    paddq %xmm5, %xmm4
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pmuludq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm1, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    paddq %xmm4, %xmm2
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: b2:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
entry:
  %A = mul <4 x i64> %i, %j
  ret <4 x i64> %A
}