• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK
3
; 128-bit case: the result of @llvm.x86.sse2.pmadd.wd(%a1, %a2) is added to
; %a0, with %a0 as the second add operand. The CHECK lines expect the call and
; the add to be emitted as a single vpdpwssd on xmm registers.
4define <4 x i32> @test_pmaddwd_v8i16_add_v4i32(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
5; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32:
6; CHECK:       # %bb.0:
7; CHECK-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
8; CHECK-NEXT:    retq
9  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
10  %2 = add <4 x i32> %1, %a0
11  ret <4 x i32> %2
12}
13
; 128-bit case with the add operands commuted: %a0 is the first add operand
; and the @llvm.x86.sse2.pmadd.wd result is the second. The CHECK lines expect
; a single vpdpwssd on xmm registers.
14define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
15; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_commute:
16; CHECK:       # %bb.0:
17; CHECK-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
18; CHECK-NEXT:    retq
19  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
20  %2 = add <4 x i32> %a0, %1
21  ret <4 x i32> %2
22}
23
; 128-bit case where the first pmadd.wd operand (%a1) is loaded from %p1.
; The CHECK lines expect the load to appear as the (%rdi) memory operand of a
; single vpdpwssd, with no separate load instruction.
24define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load1(<4 x i32> %a0, <8 x i16>* %p1, <8 x i16> %a2) {
25; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_load1:
26; CHECK:       # %bb.0:
27; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
28; CHECK-NEXT:    retq
29  %a1 = load <8 x i16>, <8 x i16>* %p1
30  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
31  %2 = add <4 x i32> %1, %a0
32  ret <4 x i32> %2
33}
34
; 128-bit case where the second pmadd.wd operand (%a2) is loaded from %p2.
; The CHECK lines expect the load to appear as the (%rdi) memory operand of a
; single vpdpwssd, with no separate load instruction.
35define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load2(<4 x i32> %a0, <8 x i16> %a1, <8 x i16>* %p2) {
36; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_load2:
37; CHECK:       # %bb.0:
38; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
39; CHECK-NEXT:    retq
40  %a2 = load <8 x i16>, <8 x i16>* %p2
41  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
42  %2 = add <4 x i32> %1, %a0
43  ret <4 x i32> %2
44}
45
; 128-bit case combining a loaded first pmadd.wd operand (%a1 from %p1) with
; commuted add operands (%a0 first). The CHECK lines expect a single vpdpwssd
; that takes (%rdi) as its memory operand.
46define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load1(<4 x i32> %a0, <8 x i16>* %p1, <8 x i16> %a2) {
47; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load1:
48; CHECK:       # %bb.0:
49; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
50; CHECK-NEXT:    retq
51  %a1 = load <8 x i16>, <8 x i16>* %p1
52  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
53  %2 = add <4 x i32> %a0, %1
54  ret <4 x i32> %2
55}
56
; 128-bit case combining a loaded second pmadd.wd operand (%a2 from %p2) with
; commuted add operands (%a0 first). The CHECK lines expect a single vpdpwssd
; that takes (%rdi) as its memory operand.
57define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load2(<4 x i32> %a0, <8 x i16> %a1, <8 x i16>* %p2) {
58; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load2:
59; CHECK:       # %bb.0:
60; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
61; CHECK-NEXT:    retq
62  %a2 = load <8 x i16>, <8 x i16>* %p2
63  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
64  %2 = add <4 x i32> %a0, %1
65  ret <4 x i32> %2
66}
67
; 256-bit case: the result of @llvm.x86.avx2.pmadd.wd(%a1, %a2) is added to
; %a0, with %a0 as the second add operand. The CHECK lines expect the call and
; the add to be emitted as a single vpdpwssd on ymm registers.
68define <8 x i32> @test_pmaddwd_v16i16_add_v8i32(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
69; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32:
70; CHECK:       # %bb.0:
71; CHECK-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
72; CHECK-NEXT:    retq
73  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
74  %2 = add <8 x i32> %1, %a0
75  ret <8 x i32> %2
76}
77
; 256-bit case with the add operands commuted: %a0 is the first add operand
; and the @llvm.x86.avx2.pmadd.wd result is the second. The CHECK lines expect
; a single vpdpwssd on ymm registers.
78define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
79; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_commute:
80; CHECK:       # %bb.0:
81; CHECK-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
82; CHECK-NEXT:    retq
83  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
84  %2 = add <8 x i32> %a0, %1
85  ret <8 x i32> %2
86}
87
; 256-bit case where the first pmadd.wd operand (%a1) is loaded from %p1.
; The CHECK lines expect the load to appear as the (%rdi) memory operand of a
; single vpdpwssd, with no separate load instruction.
88define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load1(<8 x i32> %a0, <16 x i16>* %p1, <16 x i16> %a2) {
89; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_load1:
90; CHECK:       # %bb.0:
91; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
92; CHECK-NEXT:    retq
93  %a1 = load <16 x i16>, <16 x i16>* %p1
94  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
95  %2 = add <8 x i32> %1, %a0
96  ret <8 x i32> %2
97}
98
; 256-bit case where the second pmadd.wd operand (%a2) is loaded from %p2.
; The CHECK lines expect the load to appear as the (%rdi) memory operand of a
; single vpdpwssd, with no separate load instruction.
99define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load2(<8 x i32> %a0, <16 x i16> %a1, <16 x i16>* %p2) {
100; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_load2:
101; CHECK:       # %bb.0:
102; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
103; CHECK-NEXT:    retq
104  %a2 = load <16 x i16>, <16 x i16>* %p2
105  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
106  %2 = add <8 x i32> %1, %a0
107  ret <8 x i32> %2
108}
109
; 256-bit case combining a loaded first pmadd.wd operand (%a1 from %p1) with
; commuted add operands (%a0 first). The CHECK lines expect a single vpdpwssd
; that takes (%rdi) as its memory operand.
110define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load1(<8 x i32> %a0, <16 x i16>* %p1, <16 x i16> %a2) {
111; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load1:
112; CHECK:       # %bb.0:
113; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
114; CHECK-NEXT:    retq
115  %a1 = load <16 x i16>, <16 x i16>* %p1
116  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
117  %2 = add <8 x i32> %a0, %1
118  ret <8 x i32> %2
119}
120
; 256-bit case combining a loaded second pmadd.wd operand (%a2 from %p2) with
; commuted add operands (%a0 first). The CHECK lines expect a single vpdpwssd
; that takes (%rdi) as its memory operand.
121define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load2(<8 x i32> %a0, <16 x i16> %a1, <16 x i16>* %p2) {
122; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load2:
123; CHECK:       # %bb.0:
124; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
125; CHECK-NEXT:    retq
126  %a2 = load <16 x i16>, <16 x i16>* %p2
127  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
128  %2 = add <8 x i32> %a0, %1
129  ret <8 x i32> %2
130}
131
; 512-bit case: the result of @llvm.x86.avx512.pmaddw.d.512(%a1, %a2) is added
; to %a0, with %a0 as the second add operand. The CHECK lines expect the call
; and the add to be emitted as a single vpdpwssd on zmm registers.
132define <16 x i32> @test_pmaddwd_v32i16_add_v16i32(<16 x i32> %a0, <32 x i16> %a1, <32 x i16> %a2) {
133; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32:
134; CHECK:       # %bb.0:
135; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
136; CHECK-NEXT:    retq
137  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
138  %2 = add <16 x i32> %1, %a0
139  ret <16 x i32> %2
140}
141
; 512-bit case with the add operands commuted: %a0 is the first add operand
; and the @llvm.x86.avx512.pmaddw.d.512 result is the second. The CHECK lines
; expect a single vpdpwssd on zmm registers.
142define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_commute(<16 x i32> %a0, <32 x i16> %a1, <32 x i16> %a2) {
143; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_commute:
144; CHECK:       # %bb.0:
145; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
146; CHECK-NEXT:    retq
147  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
148  %2 = add <16 x i32> %a0, %1
149  ret <16 x i32> %2
150}
151
; 512-bit case where the first pmaddw.d operand (%a1) is loaded from %p1.
; The CHECK lines expect the load to appear as the (%rdi) memory operand of a
; single vpdpwssd, with no separate load instruction.
152define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_load1(<16 x i32> %a0, <32 x i16>* %p1, <32 x i16> %a2) {
153; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_load1:
154; CHECK:       # %bb.0:
155; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
156; CHECK-NEXT:    retq
157  %a1 = load <32 x i16>, <32 x i16>* %p1
158  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
159  %2 = add <16 x i32> %1, %a0
160  ret <16 x i32> %2
161}
162
; 512-bit case where the second pmaddw.d operand (%a2) is loaded from %p2.
; The CHECK lines expect the load to appear as the (%rdi) memory operand of a
; single vpdpwssd, with no separate load instruction.
163define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_load2(<16 x i32> %a0, <32 x i16> %a1, <32 x i16>* %p2) {
164; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_load2:
165; CHECK:       # %bb.0:
166; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
167; CHECK-NEXT:    retq
168  %a2 = load <32 x i16>, <32 x i16>* %p2
169  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
170  %2 = add <16 x i32> %1, %a0
171  ret <16 x i32> %2
172}
173
; 512-bit case combining a loaded first pmaddw.d operand (%a1 from %p1) with
; commuted add operands (%a0 first). The CHECK lines expect a single vpdpwssd
; that takes (%rdi) as its memory operand.
174define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_commute_load1(<16 x i32> %a0, <32 x i16>* %p1, <32 x i16> %a2) {
175; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_commute_load1:
176; CHECK:       # %bb.0:
177; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
178; CHECK-NEXT:    retq
179  %a1 = load <32 x i16>, <32 x i16>* %p1
180  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
181  %2 = add <16 x i32> %a0, %1
182  ret <16 x i32> %2
183}
184
; 512-bit case combining a loaded second pmaddw.d operand (%a2 from %p2) with
; commuted add operands (%a0 first). The CHECK lines expect a single vpdpwssd
; that takes (%rdi) as its memory operand.
185define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_commute_load2(<16 x i32> %a0, <32 x i16> %a1, <32 x i16>* %p2) {
186; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_commute_load2:
187; CHECK:       # %bb.0:
188; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
189; CHECK-NEXT:    retq
190  %a2 = load <32 x i16>, <32 x i16>* %p2
191  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
192  %2 = add <16 x i32> %a0, %1
193  ret <16 x i32> %2
194}
195
; Declarations of the pmaddwd intrinsics called by the tests in this file:
; each takes two i16 vectors and returns an i32 vector with half as many
; elements (8 -> 4, 16 -> 8, 32 -> 16).
196declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
197declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
198declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
199