; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

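; The tests below compute the high 16 bits of a widened 16-bit multiply
; (zext/sext, mul, lshr 16, trunc), which should select PMULHUW/PMULHW
; whenever possible. For sub-native <4 x i16>, SSE2 packs the words and
; still uses PMULHUW, while SSE4.1/AVX zero the odd words and use
; PMULLD+PSRLD instead.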
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhuw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhuw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

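; Signed <4 x i16> variant: SSE4.1/AVX sign-extend each element in-lane
; with PSLLD+PSRAD before the 32-bit multiply.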
define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-LABEL: mulhw_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmulhw %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mulhw_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pslld $16, %xmm0
; SSE41-NEXT:    psrad $16, %xmm0
; SSE41-NEXT:    pslld $16, %xmm1
; SSE41-NEXT:    psrad $16, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $16, %xmm0, %xmm0
; AVX-NEXT:    vpslld $16, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

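; At the native <8 x i16> width the whole pattern folds to a single PMULHUW.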
define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhuw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

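; Likewise, the signed pattern folds to a single PMULHW.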
define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

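; <16 x i16> splits into two 128-bit PMULHUWs on SSE and uses one 256-bit
; VPMULHUW on AVX.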
define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhuw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm2, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

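; Same split/fold behavior for the signed <16 x i16> case.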
define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

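; <32 x i16> takes four XMM ops on SSE and two YMM ops on AVX2/AVX512F;
; only AVX512BW has 512-bit word multiplies, so it uses a single ZMM op.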
define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhuw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm4, %xmm0
; SSE-NEXT:    pmulhuw %xmm5, %xmm1
; SSE-NEXT:    pmulhuw %xmm6, %xmm2
; SSE-NEXT:    pmulhuw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

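; Signed <32 x i16>: same lowering shape, with PMULHW/VPMULHW.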
define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm4, %xmm0
; SSE-NEXT:    pmulhw %xmm5, %xmm1
; SSE-NEXT:    pmulhw %xmm6, %xmm2
; SSE-NEXT:    pmulhw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

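; <64 x i16> exceeds the XMM register file on SSE: the second operand is
; passed on the stack and the result is returned through the hidden sret
; pointer in %rdi (copied to %rax), hence the memory operands and stores.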
define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhuw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}

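; Signed <64 x i16> follows the same pattern, with PMULHW replacing PMULHUW.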
define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}