; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64
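; Test FastISel selection of stores: scalar i32/i16 plus 128-, 256-, and
; 512-bit integer and floating-point vectors, each at unaligned (align 1)
; and naturally aligned alignments, across SSE2, AVX, and AVX-512 feature
; sets on both 64-bit and 32-bit x86 targets.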

define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32:       # %bb.0: # %entry
; ALL32-NEXT:    movl %esi, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_32:
; ALL64:       # %bb.0: # %entry
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movl %eax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32:       # %bb.0: # %entry
; ALL32-NEXT:    movw %si, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_16:
; ALL64:       # %bb.0: # %entry
; ALL64-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movw %ax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqu %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqu %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xi32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqu %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqa %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqa %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xi32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqa %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovupd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovupd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovapd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovapd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovupd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovupd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32:       # %bb.0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovapd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64:       # %bb.0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovapd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    movupd %xmm2, 32(%rdi)
; SSE32-NEXT:    movupd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    movupd %xmm2, 32(%eax)
; SSE64-NEXT:    movupd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovupd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovupd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32:       # %bb.0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    movapd %xmm2, 32(%rdi)
; SSE32-NEXT:    movapd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64:       # %bb.0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    movapd %xmm2, 32(%eax)
; SSE64-NEXT:    movapd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    .cfi_def_cfa_offset 4
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32:       # %bb.0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64:       # %bb.0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232:       # %bb.0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovapd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264:       # %bb.0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovapd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}