; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhuw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhuw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmulhw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pslld $16, %xmm0
; SSE41-NEXT:    psrad $16, %xmm0
; SSE41-NEXT:    pslld $16, %xmm1
; SSE41-NEXT:    psrad $16, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $16, %xmm0, %xmm0
; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhuw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhuw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm2, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhuw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm4, %xmm0
; SSE-NEXT:    pmulhuw %xmm5, %xmm1
; SSE-NEXT:    pmulhuw %xmm6, %xmm2
; SSE-NEXT:    pmulhuw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm4, %xmm0
; SSE-NEXT:    pmulhw %xmm5, %xmm1
; SSE-NEXT:    pmulhw %xmm6, %xmm2
; SSE-NEXT:    pmulhw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhuw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}

define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}