1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX
8
9; Make sure that we generate non-temporal stores for the test cases below.
10; We use xorps for zeroing, so domain information isn't available anymore.
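;
; For context: IR with !nontemporal stores like the ones below typically comes
; from C/C++ sources using the streaming-store intrinsics or Clang's
; __builtin_nontemporal_store. A minimal illustrative sketch (function names are
; ours, not part of this test):
;
;   #include <immintrin.h>
;
;   void zero_v4f32(float *dst) {
;     _mm_stream_ps(dst, _mm_setzero_ps());  /* movntps / vmovntps */
;   }
;
;   void store_i32(int *dst, int v) {
;     _mm_stream_si32(dst, v);               /* movnti */
;   }
;
;   void store_f32(float *dst, float v) {
;     __builtin_nontemporal_store(v, dst);   /* store float ..., !nontemporal */
;   }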
11
12; Scalar versions (zeroing means we can do this even for fp types).
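; The zero is rematerialized in a GPR (xorl) and stored with MOVNTI/MOVNTIQ, so
; all subtargets below, including SSE4A, share the same integer NT-store
; sequence even for the f32/f64 cases.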
13
14define void @test_zero_f32(float* %dst) {
15; SSE-LABEL: test_zero_f32:
16; SSE:       # %bb.0:
17; SSE-NEXT:    xorl %eax, %eax
18; SSE-NEXT:    movntil %eax, (%rdi)
19; SSE-NEXT:    retq
20;
21; AVX-LABEL: test_zero_f32:
22; AVX:       # %bb.0:
23; AVX-NEXT:    xorl %eax, %eax
24; AVX-NEXT:    movntil %eax, (%rdi)
25; AVX-NEXT:    retq
26;
27; VLX-LABEL: test_zero_f32:
28; VLX:       # %bb.0:
29; VLX-NEXT:    xorl %eax, %eax
30; VLX-NEXT:    movntil %eax, (%rdi)
31; VLX-NEXT:    retq
32  store float zeroinitializer, float* %dst, align 1, !nontemporal !1
33  ret void
34}
35
36define void @test_zero_i32(i32* %dst) {
37; SSE-LABEL: test_zero_i32:
38; SSE:       # %bb.0:
39; SSE-NEXT:    xorl %eax, %eax
40; SSE-NEXT:    movntil %eax, (%rdi)
41; SSE-NEXT:    retq
42;
43; AVX-LABEL: test_zero_i32:
44; AVX:       # %bb.0:
45; AVX-NEXT:    xorl %eax, %eax
46; AVX-NEXT:    movntil %eax, (%rdi)
47; AVX-NEXT:    retq
48;
49; VLX-LABEL: test_zero_i32:
50; VLX:       # %bb.0:
51; VLX-NEXT:    xorl %eax, %eax
52; VLX-NEXT:    movntil %eax, (%rdi)
53; VLX-NEXT:    retq
54  store i32 zeroinitializer, i32* %dst, align 1, !nontemporal !1
55  ret void
56}
57
58define void @test_zero_f64(double* %dst) {
59; SSE-LABEL: test_zero_f64:
60; SSE:       # %bb.0:
61; SSE-NEXT:    xorl %eax, %eax
62; SSE-NEXT:    movntiq %rax, (%rdi)
63; SSE-NEXT:    retq
64;
65; AVX-LABEL: test_zero_f64:
66; AVX:       # %bb.0:
67; AVX-NEXT:    xorl %eax, %eax
68; AVX-NEXT:    movntiq %rax, (%rdi)
69; AVX-NEXT:    retq
70;
71; VLX-LABEL: test_zero_f64:
72; VLX:       # %bb.0:
73; VLX-NEXT:    xorl %eax, %eax
74; VLX-NEXT:    movntiq %rax, (%rdi)
75; VLX-NEXT:    retq
76  store double zeroinitializer, double* %dst, align 1, !nontemporal !1
77  ret void
78}
79
80define void @test_zero_i64(i64* %dst) {
81; SSE-LABEL: test_zero_i64:
82; SSE:       # %bb.0:
83; SSE-NEXT:    xorl %eax, %eax
84; SSE-NEXT:    movntiq %rax, (%rdi)
85; SSE-NEXT:    retq
86;
87; AVX-LABEL: test_zero_i64:
88; AVX:       # %bb.0:
89; AVX-NEXT:    xorl %eax, %eax
90; AVX-NEXT:    movntiq %rax, (%rdi)
91; AVX-NEXT:    retq
92;
93; VLX-LABEL: test_zero_i64:
94; VLX:       # %bb.0:
95; VLX-NEXT:    xorl %eax, %eax
96; VLX-NEXT:    movntiq %rax, (%rdi)
97; VLX-NEXT:    retq
98  store i64 zeroinitializer, i64* %dst, align 1, !nontemporal !1
99  ret void
100}
101
102; And now XMM versions.
103
104define void @test_zero_v4f32(<4 x float>* %dst) {
105; SSE-LABEL: test_zero_v4f32:
106; SSE:       # %bb.0:
107; SSE-NEXT:    xorps %xmm0, %xmm0
108; SSE-NEXT:    movntps %xmm0, (%rdi)
109; SSE-NEXT:    retq
110;
111; AVX-LABEL: test_zero_v4f32:
112; AVX:       # %bb.0:
113; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
114; AVX-NEXT:    vmovntps %xmm0, (%rdi)
115; AVX-NEXT:    retq
116;
117; VLX-LABEL: test_zero_v4f32:
118; VLX:       # %bb.0:
119; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
120; VLX-NEXT:    vmovntps %xmm0, (%rdi)
121; VLX-NEXT:    retq
122  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
123  ret void
124}
125
126define void @test_zero_v4i32(<4 x i32>* %dst) {
127; SSE-LABEL: test_zero_v4i32:
128; SSE:       # %bb.0:
129; SSE-NEXT:    xorps %xmm0, %xmm0
130; SSE-NEXT:    movntps %xmm0, (%rdi)
131; SSE-NEXT:    retq
132;
133; AVX-LABEL: test_zero_v4i32:
134; AVX:       # %bb.0:
135; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
136; AVX-NEXT:    vmovntps %xmm0, (%rdi)
137; AVX-NEXT:    retq
138;
139; VLX-LABEL: test_zero_v4i32:
140; VLX:       # %bb.0:
141; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
142; VLX-NEXT:    vmovntps %xmm0, (%rdi)
143; VLX-NEXT:    retq
144  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
146  ret void
147}
148
149define void @test_zero_v2f64(<2 x double>* %dst) {
150; SSE-LABEL: test_zero_v2f64:
151; SSE:       # %bb.0:
152; SSE-NEXT:    xorps %xmm0, %xmm0
153; SSE-NEXT:    movntps %xmm0, (%rdi)
154; SSE-NEXT:    retq
155;
156; AVX-LABEL: test_zero_v2f64:
157; AVX:       # %bb.0:
158; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
159; AVX-NEXT:    vmovntps %xmm0, (%rdi)
160; AVX-NEXT:    retq
161;
162; VLX-LABEL: test_zero_v2f64:
163; VLX:       # %bb.0:
164; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
165; VLX-NEXT:    vmovntps %xmm0, (%rdi)
166; VLX-NEXT:    retq
167  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
168  ret void
169}
170
171define void @test_zero_v2i64(<2 x i64>* %dst) {
172; SSE-LABEL: test_zero_v2i64:
173; SSE:       # %bb.0:
174; SSE-NEXT:    xorps %xmm0, %xmm0
175; SSE-NEXT:    movntps %xmm0, (%rdi)
176; SSE-NEXT:    retq
177;
178; AVX-LABEL: test_zero_v2i64:
179; AVX:       # %bb.0:
180; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
181; AVX-NEXT:    vmovntps %xmm0, (%rdi)
182; AVX-NEXT:    retq
183;
184; VLX-LABEL: test_zero_v2i64:
185; VLX:       # %bb.0:
186; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
187; VLX-NEXT:    vmovntps %xmm0, (%rdi)
188; VLX-NEXT:    retq
189  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
190  ret void
191}
192
193define void @test_zero_v8i16(<8 x i16>* %dst) {
194; SSE-LABEL: test_zero_v8i16:
195; SSE:       # %bb.0:
196; SSE-NEXT:    xorps %xmm0, %xmm0
197; SSE-NEXT:    movntps %xmm0, (%rdi)
198; SSE-NEXT:    retq
199;
200; AVX-LABEL: test_zero_v8i16:
201; AVX:       # %bb.0:
202; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
203; AVX-NEXT:    vmovntps %xmm0, (%rdi)
204; AVX-NEXT:    retq
205;
206; VLX-LABEL: test_zero_v8i16:
207; VLX:       # %bb.0:
208; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
209; VLX-NEXT:    vmovntps %xmm0, (%rdi)
210; VLX-NEXT:    retq
211  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
212  ret void
213}
214
215define void @test_zero_v16i8(<16 x i8>* %dst) {
216; SSE-LABEL: test_zero_v16i8:
217; SSE:       # %bb.0:
218; SSE-NEXT:    xorps %xmm0, %xmm0
219; SSE-NEXT:    movntps %xmm0, (%rdi)
220; SSE-NEXT:    retq
221;
222; AVX-LABEL: test_zero_v16i8:
223; AVX:       # %bb.0:
224; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
225; AVX-NEXT:    vmovntps %xmm0, (%rdi)
226; AVX-NEXT:    retq
227;
228; VLX-LABEL: test_zero_v16i8:
229; VLX:       # %bb.0:
230; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
231; VLX-NEXT:    vmovntps %xmm0, (%rdi)
232; VLX-NEXT:    retq
233  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
234  ret void
235}
236
237; And now YMM versions.
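; Without AVX there is no 256-bit store, so the 32-byte cases are split into two
; 16-byte MOVNTPS stores at (%rdi) and 16(%rdi); with AVX/AVX512VL a single
; VMOVNTPS %ymm is used, followed by VZEROUPPER.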
238
239define void @test_zero_v8f32(<8 x float>* %dst) {
240; SSE-LABEL: test_zero_v8f32:
241; SSE:       # %bb.0:
242; SSE-NEXT:    xorps %xmm0, %xmm0
243; SSE-NEXT:    movntps %xmm0, 16(%rdi)
244; SSE-NEXT:    movntps %xmm0, (%rdi)
245; SSE-NEXT:    retq
246;
247; AVX-LABEL: test_zero_v8f32:
248; AVX:       # %bb.0:
249; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
250; AVX-NEXT:    vmovntps %ymm0, (%rdi)
251; AVX-NEXT:    vzeroupper
252; AVX-NEXT:    retq
253;
254; VLX-LABEL: test_zero_v8f32:
255; VLX:       # %bb.0:
256; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
257; VLX-NEXT:    vmovntps %ymm0, (%rdi)
258; VLX-NEXT:    vzeroupper
259; VLX-NEXT:    retq
260  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
261  ret void
262}
263
264define void @test_zero_v8i32(<8 x i32>* %dst) {
265; SSE-LABEL: test_zero_v8i32:
266; SSE:       # %bb.0:
267; SSE-NEXT:    xorps %xmm0, %xmm0
268; SSE-NEXT:    movntps %xmm0, 16(%rdi)
269; SSE-NEXT:    movntps %xmm0, (%rdi)
270; SSE-NEXT:    retq
271;
272; AVX-LABEL: test_zero_v8i32:
273; AVX:       # %bb.0:
274; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
275; AVX-NEXT:    vmovntps %ymm0, (%rdi)
276; AVX-NEXT:    vzeroupper
277; AVX-NEXT:    retq
278;
279; VLX-LABEL: test_zero_v8i32:
280; VLX:       # %bb.0:
281; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
282; VLX-NEXT:    vmovntps %ymm0, (%rdi)
283; VLX-NEXT:    vzeroupper
284; VLX-NEXT:    retq
285  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
286  ret void
287}
288
289define void @test_zero_v4f64(<4 x double>* %dst) {
290; SSE-LABEL: test_zero_v4f64:
291; SSE:       # %bb.0:
292; SSE-NEXT:    xorps %xmm0, %xmm0
293; SSE-NEXT:    movntps %xmm0, 16(%rdi)
294; SSE-NEXT:    movntps %xmm0, (%rdi)
295; SSE-NEXT:    retq
296;
297; AVX-LABEL: test_zero_v4f64:
298; AVX:       # %bb.0:
299; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
300; AVX-NEXT:    vmovntps %ymm0, (%rdi)
301; AVX-NEXT:    vzeroupper
302; AVX-NEXT:    retq
303;
304; VLX-LABEL: test_zero_v4f64:
305; VLX:       # %bb.0:
306; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
307; VLX-NEXT:    vmovntps %ymm0, (%rdi)
308; VLX-NEXT:    vzeroupper
309; VLX-NEXT:    retq
310  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
311  ret void
312}
313
314define void @test_zero_v4i64(<4 x i64>* %dst) {
315; SSE-LABEL: test_zero_v4i64:
316; SSE:       # %bb.0:
317; SSE-NEXT:    xorps %xmm0, %xmm0
318; SSE-NEXT:    movntps %xmm0, 16(%rdi)
319; SSE-NEXT:    movntps %xmm0, (%rdi)
320; SSE-NEXT:    retq
321;
322; AVX-LABEL: test_zero_v4i64:
323; AVX:       # %bb.0:
324; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
325; AVX-NEXT:    vmovntps %ymm0, (%rdi)
326; AVX-NEXT:    vzeroupper
327; AVX-NEXT:    retq
328;
329; VLX-LABEL: test_zero_v4i64:
330; VLX:       # %bb.0:
331; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
332; VLX-NEXT:    vmovntps %ymm0, (%rdi)
333; VLX-NEXT:    vzeroupper
334; VLX-NEXT:    retq
335  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
336  ret void
337}
338
339define void @test_zero_v16i16(<16 x i16>* %dst) {
340; SSE-LABEL: test_zero_v16i16:
341; SSE:       # %bb.0:
342; SSE-NEXT:    xorps %xmm0, %xmm0
343; SSE-NEXT:    movntps %xmm0, 16(%rdi)
344; SSE-NEXT:    movntps %xmm0, (%rdi)
345; SSE-NEXT:    retq
346;
347; AVX-LABEL: test_zero_v16i16:
348; AVX:       # %bb.0:
349; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
350; AVX-NEXT:    vmovntps %ymm0, (%rdi)
351; AVX-NEXT:    vzeroupper
352; AVX-NEXT:    retq
353;
354; VLX-LABEL: test_zero_v16i16:
355; VLX:       # %bb.0:
356; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
357; VLX-NEXT:    vmovntps %ymm0, (%rdi)
358; VLX-NEXT:    vzeroupper
359; VLX-NEXT:    retq
360  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
361  ret void
362}
363
364define void @test_zero_v32i8(<32 x i8>* %dst) {
365; SSE-LABEL: test_zero_v32i8:
366; SSE:       # %bb.0:
367; SSE-NEXT:    xorps %xmm0, %xmm0
368; SSE-NEXT:    movntps %xmm0, 16(%rdi)
369; SSE-NEXT:    movntps %xmm0, (%rdi)
370; SSE-NEXT:    retq
371;
372; AVX-LABEL: test_zero_v32i8:
373; AVX:       # %bb.0:
374; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
375; AVX-NEXT:    vmovntps %ymm0, (%rdi)
376; AVX-NEXT:    vzeroupper
377; AVX-NEXT:    retq
378;
379; VLX-LABEL: test_zero_v32i8:
380; VLX:       # %bb.0:
381; VLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
382; VLX-NEXT:    vmovntps %ymm0, (%rdi)
383; VLX-NEXT:    vzeroupper
384; VLX-NEXT:    retq
385  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
386  ret void
387}
388
389
390; Check that we also handle arguments.  Here the type survives longer.
391
392; Scalar versions.
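; A scalar FP value that is live in an XMM register has no non-temporal store on
; plain SSE2/SSE4.1 or AVX, so those targets emit a regular movss/movsd and drop
; the hint; only SSE4A provides MOVNTSS/MOVNTSD here. Integer arguments always
; have MOVNTI/MOVNTIQ available.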
393
394define void @test_arg_f32(float %arg, float* %dst) {
395; SSE2-LABEL: test_arg_f32:
396; SSE2:       # %bb.0:
397; SSE2-NEXT:    movss %xmm0, (%rdi)
398; SSE2-NEXT:    retq
399;
400; SSE4A-LABEL: test_arg_f32:
401; SSE4A:       # %bb.0:
402; SSE4A-NEXT:    movntss %xmm0, (%rdi)
403; SSE4A-NEXT:    retq
404;
405; SSE41-LABEL: test_arg_f32:
406; SSE41:       # %bb.0:
407; SSE41-NEXT:    movss %xmm0, (%rdi)
408; SSE41-NEXT:    retq
409;
410; AVX-LABEL: test_arg_f32:
411; AVX:       # %bb.0:
412; AVX-NEXT:    vmovss %xmm0, (%rdi)
413; AVX-NEXT:    retq
414;
415; VLX-LABEL: test_arg_f32:
416; VLX:       # %bb.0:
417; VLX-NEXT:    vmovss %xmm0, (%rdi)
418; VLX-NEXT:    retq
419  store float %arg, float* %dst, align 1, !nontemporal !1
420  ret void
421}
422
423define void @test_arg_i32(i32 %arg, i32* %dst) {
424; SSE-LABEL: test_arg_i32:
425; SSE:       # %bb.0:
426; SSE-NEXT:    movntil %edi, (%rsi)
427; SSE-NEXT:    retq
428;
429; AVX-LABEL: test_arg_i32:
430; AVX:       # %bb.0:
431; AVX-NEXT:    movntil %edi, (%rsi)
432; AVX-NEXT:    retq
433;
434; VLX-LABEL: test_arg_i32:
435; VLX:       # %bb.0:
436; VLX-NEXT:    movntil %edi, (%rsi)
437; VLX-NEXT:    retq
438  store i32 %arg, i32* %dst, align 1, !nontemporal !1
439  ret void
440}
441
442define void @test_arg_f64(double %arg, double* %dst) {
443; SSE2-LABEL: test_arg_f64:
444; SSE2:       # %bb.0:
445; SSE2-NEXT:    movsd %xmm0, (%rdi)
446; SSE2-NEXT:    retq
447;
448; SSE4A-LABEL: test_arg_f64:
449; SSE4A:       # %bb.0:
450; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
451; SSE4A-NEXT:    retq
452;
453; SSE41-LABEL: test_arg_f64:
454; SSE41:       # %bb.0:
455; SSE41-NEXT:    movsd %xmm0, (%rdi)
456; SSE41-NEXT:    retq
457;
458; AVX-LABEL: test_arg_f64:
459; AVX:       # %bb.0:
460; AVX-NEXT:    vmovsd %xmm0, (%rdi)
461; AVX-NEXT:    retq
462;
463; VLX-LABEL: test_arg_f64:
464; VLX:       # %bb.0:
465; VLX-NEXT:    vmovsd %xmm0, (%rdi)
466; VLX-NEXT:    retq
467  store double %arg, double* %dst, align 1, !nontemporal !1
468  ret void
469}
470
471define void @test_arg_i64(i64 %arg, i64* %dst) {
472; SSE-LABEL: test_arg_i64:
473; SSE:       # %bb.0:
474; SSE-NEXT:    movntiq %rdi, (%rsi)
475; SSE-NEXT:    retq
476;
477; AVX-LABEL: test_arg_i64:
478; AVX:       # %bb.0:
479; AVX-NEXT:    movntiq %rdi, (%rsi)
480; AVX-NEXT:    retq
481;
482; VLX-LABEL: test_arg_i64:
483; VLX:       # %bb.0:
484; VLX-NEXT:    movntiq %rdi, (%rsi)
485; VLX-NEXT:    retq
486  store i64 %arg, i64* %dst, align 1, !nontemporal !1
487  ret void
488}
489
490; Extract versions
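; Extracting one lane and storing it non-temporally: integer lanes are moved to
; a GPR (extractps/pextrq, or a shuffle plus movd/movq on older subtargets) and
; use MOVNTI/MOVNTIQ. FP lanes use SSE4A's MOVNTSS/MOVNTSD where available;
; otherwise the f32 lane goes through a GPR and MOVNTI on SSE4.1/AVX (plain SSE2
; falls back to a regular movss), and the f64 lane falls back to a regular
; movhpd store.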
491
492define void @test_extract_f32(<4 x float> %arg, float* %dst) {
493; SSE2-LABEL: test_extract_f32:
494; SSE2:       # %bb.0:
495; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
496; SSE2-NEXT:    movss %xmm0, (%rdi)
497; SSE2-NEXT:    retq
498;
499; SSE4A-LABEL: test_extract_f32:
500; SSE4A:       # %bb.0:
501; SSE4A-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
502; SSE4A-NEXT:    movntss %xmm0, (%rdi)
503; SSE4A-NEXT:    retq
504;
505; SSE41-LABEL: test_extract_f32:
506; SSE41:       # %bb.0:
507; SSE41-NEXT:    extractps $1, %xmm0, %eax
508; SSE41-NEXT:    movntil %eax, (%rdi)
509; SSE41-NEXT:    retq
510;
511; AVX-LABEL: test_extract_f32:
512; AVX:       # %bb.0:
513; AVX-NEXT:    vextractps $1, %xmm0, %eax
514; AVX-NEXT:    movntil %eax, (%rdi)
515; AVX-NEXT:    retq
516;
517; VLX-LABEL: test_extract_f32:
518; VLX:       # %bb.0:
519; VLX-NEXT:    vextractps $1, %xmm0, %eax
520; VLX-NEXT:    movntil %eax, (%rdi)
521; VLX-NEXT:    retq
522  %1 = extractelement <4 x float> %arg, i32 1
523  store float %1, float* %dst, align 1, !nontemporal !1
524  ret void
525}
526
527define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
528; SSE2-LABEL: test_extract_i32:
529; SSE2:       # %bb.0:
530; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
531; SSE2-NEXT:    movd %xmm0, %eax
532; SSE2-NEXT:    movntil %eax, (%rdi)
533; SSE2-NEXT:    retq
534;
535; SSE4A-LABEL: test_extract_i32:
536; SSE4A:       # %bb.0:
537; SSE4A-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
538; SSE4A-NEXT:    movd %xmm0, %eax
539; SSE4A-NEXT:    movntil %eax, (%rdi)
540; SSE4A-NEXT:    retq
541;
542; SSE41-LABEL: test_extract_i32:
543; SSE41:       # %bb.0:
544; SSE41-NEXT:    extractps $1, %xmm0, %eax
545; SSE41-NEXT:    movntil %eax, (%rdi)
546; SSE41-NEXT:    retq
547;
548; AVX-LABEL: test_extract_i32:
549; AVX:       # %bb.0:
550; AVX-NEXT:    vextractps $1, %xmm0, %eax
551; AVX-NEXT:    movntil %eax, (%rdi)
552; AVX-NEXT:    retq
553;
554; VLX-LABEL: test_extract_i32:
555; VLX:       # %bb.0:
556; VLX-NEXT:    vextractps $1, %xmm0, %eax
557; VLX-NEXT:    movntil %eax, (%rdi)
558; VLX-NEXT:    retq
559  %1 = extractelement <4 x i32> %arg, i32 1
560  store i32 %1, i32* %dst, align 1, !nontemporal !1
561  ret void
562}
563
564define void @test_extract_f64(<2 x double> %arg, double* %dst) {
565; SSE2-LABEL: test_extract_f64:
566; SSE2:       # %bb.0:
567; SSE2-NEXT:    movhpd %xmm0, (%rdi)
568; SSE2-NEXT:    retq
569;
570; SSE4A-LABEL: test_extract_f64:
571; SSE4A:       # %bb.0:
572; SSE4A-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
573; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
574; SSE4A-NEXT:    retq
575;
576; SSE41-LABEL: test_extract_f64:
577; SSE41:       # %bb.0:
578; SSE41-NEXT:    movhpd %xmm0, (%rdi)
579; SSE41-NEXT:    retq
580;
581; AVX-LABEL: test_extract_f64:
582; AVX:       # %bb.0:
583; AVX-NEXT:    vmovhpd %xmm0, (%rdi)
584; AVX-NEXT:    retq
585;
586; VLX-LABEL: test_extract_f64:
587; VLX:       # %bb.0:
588; VLX-NEXT:    vmovhpd %xmm0, (%rdi)
589; VLX-NEXT:    retq
590  %1 = extractelement <2 x double> %arg, i32 1
591  store double %1, double* %dst, align 1, !nontemporal !1
592  ret void
593}
594
595define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
596; SSE2-LABEL: test_extract_i64:
597; SSE2:       # %bb.0:
598; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
599; SSE2-NEXT:    movq %xmm0, %rax
600; SSE2-NEXT:    movntiq %rax, (%rdi)
601; SSE2-NEXT:    retq
602;
603; SSE4A-LABEL: test_extract_i64:
604; SSE4A:       # %bb.0:
605; SSE4A-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
606; SSE4A-NEXT:    movq %xmm0, %rax
607; SSE4A-NEXT:    movntiq %rax, (%rdi)
608; SSE4A-NEXT:    retq
609;
610; SSE41-LABEL: test_extract_i64:
611; SSE41:       # %bb.0:
612; SSE41-NEXT:    pextrq $1, %xmm0, %rax
613; SSE41-NEXT:    movntiq %rax, (%rdi)
614; SSE41-NEXT:    retq
615;
616; AVX-LABEL: test_extract_i64:
617; AVX:       # %bb.0:
618; AVX-NEXT:    vpextrq $1, %xmm0, %rax
619; AVX-NEXT:    movntiq %rax, (%rdi)
620; AVX-NEXT:    retq
621;
622; VLX-LABEL: test_extract_i64:
623; VLX:       # %bb.0:
624; VLX-NEXT:    vpextrq $1, %xmm0, %rax
625; VLX-NEXT:    movntiq %rax, (%rdi)
626; VLX-NEXT:    retq
627  %1 = extractelement <2 x i64> %arg, i32 1
628  store i64 %1, i64* %dst, align 1, !nontemporal !1
629  ret void
630}
631
632; And now XMM versions.
633
634define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
635; SSE-LABEL: test_arg_v4f32:
636; SSE:       # %bb.0:
637; SSE-NEXT:    movntps %xmm0, (%rdi)
638; SSE-NEXT:    retq
639;
640; AVX-LABEL: test_arg_v4f32:
641; AVX:       # %bb.0:
642; AVX-NEXT:    vmovntps %xmm0, (%rdi)
643; AVX-NEXT:    retq
644;
645; VLX-LABEL: test_arg_v4f32:
646; VLX:       # %bb.0:
647; VLX-NEXT:    vmovntps %xmm0, (%rdi)
648; VLX-NEXT:    retq
649  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
650  ret void
651}
652
653define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
654; SSE-LABEL: test_arg_v4i32:
655; SSE:       # %bb.0:
656; SSE-NEXT:    movntps %xmm0, (%rdi)
657; SSE-NEXT:    retq
658;
659; AVX-LABEL: test_arg_v4i32:
660; AVX:       # %bb.0:
661; AVX-NEXT:    vmovntps %xmm0, (%rdi)
662; AVX-NEXT:    retq
663;
664; VLX-LABEL: test_arg_v4i32:
665; VLX:       # %bb.0:
666; VLX-NEXT:    vmovntps %xmm0, (%rdi)
667; VLX-NEXT:    retq
668  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
669  ret void
670}
671
672define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
673; SSE-LABEL: test_arg_v2f64:
674; SSE:       # %bb.0:
675; SSE-NEXT:    movntps %xmm0, (%rdi)
676; SSE-NEXT:    retq
677;
678; AVX-LABEL: test_arg_v2f64:
679; AVX:       # %bb.0:
680; AVX-NEXT:    vmovntps %xmm0, (%rdi)
681; AVX-NEXT:    retq
682;
683; VLX-LABEL: test_arg_v2f64:
684; VLX:       # %bb.0:
685; VLX-NEXT:    vmovntps %xmm0, (%rdi)
686; VLX-NEXT:    retq
687  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
688  ret void
689}
690
691define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
692; SSE-LABEL: test_arg_v2i64:
693; SSE:       # %bb.0:
694; SSE-NEXT:    movntps %xmm0, (%rdi)
695; SSE-NEXT:    retq
696;
697; AVX-LABEL: test_arg_v2i64:
698; AVX:       # %bb.0:
699; AVX-NEXT:    vmovntps %xmm0, (%rdi)
700; AVX-NEXT:    retq
701;
702; VLX-LABEL: test_arg_v2i64:
703; VLX:       # %bb.0:
704; VLX-NEXT:    vmovntps %xmm0, (%rdi)
705; VLX-NEXT:    retq
706  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
707  ret void
708}
709
710define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
711; SSE-LABEL: test_arg_v8i16:
712; SSE:       # %bb.0:
713; SSE-NEXT:    movntps %xmm0, (%rdi)
714; SSE-NEXT:    retq
715;
716; AVX-LABEL: test_arg_v8i16:
717; AVX:       # %bb.0:
718; AVX-NEXT:    vmovntps %xmm0, (%rdi)
719; AVX-NEXT:    retq
720;
721; VLX-LABEL: test_arg_v8i16:
722; VLX:       # %bb.0:
723; VLX-NEXT:    vmovntps %xmm0, (%rdi)
724; VLX-NEXT:    retq
725  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
726  ret void
727}
728
729define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
730; SSE-LABEL: test_arg_v16i8:
731; SSE:       # %bb.0:
732; SSE-NEXT:    movntps %xmm0, (%rdi)
733; SSE-NEXT:    retq
734;
735; AVX-LABEL: test_arg_v16i8:
736; AVX:       # %bb.0:
737; AVX-NEXT:    vmovntps %xmm0, (%rdi)
738; AVX-NEXT:    retq
739;
740; VLX-LABEL: test_arg_v16i8:
741; VLX:       # %bb.0:
742; VLX-NEXT:    vmovntps %xmm0, (%rdi)
743; VLX-NEXT:    retq
744  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
745  ret void
746}
747
748; And now YMM versions.
749
750define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
751; SSE-LABEL: test_arg_v8f32:
752; SSE:       # %bb.0:
753; SSE-NEXT:    movntps %xmm1, 16(%rdi)
754; SSE-NEXT:    movntps %xmm0, (%rdi)
755; SSE-NEXT:    retq
756;
757; AVX-LABEL: test_arg_v8f32:
758; AVX:       # %bb.0:
759; AVX-NEXT:    vmovntps %ymm0, (%rdi)
760; AVX-NEXT:    vzeroupper
761; AVX-NEXT:    retq
762;
763; VLX-LABEL: test_arg_v8f32:
764; VLX:       # %bb.0:
765; VLX-NEXT:    vmovntps %ymm0, (%rdi)
766; VLX-NEXT:    vzeroupper
767; VLX-NEXT:    retq
768  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
769  ret void
770}
771
772define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
773; SSE-LABEL: test_arg_v8i32:
774; SSE:       # %bb.0:
775; SSE-NEXT:    movntps %xmm1, 16(%rdi)
776; SSE-NEXT:    movntps %xmm0, (%rdi)
777; SSE-NEXT:    retq
778;
779; AVX-LABEL: test_arg_v8i32:
780; AVX:       # %bb.0:
781; AVX-NEXT:    vmovntps %ymm0, (%rdi)
782; AVX-NEXT:    vzeroupper
783; AVX-NEXT:    retq
784;
785; VLX-LABEL: test_arg_v8i32:
786; VLX:       # %bb.0:
787; VLX-NEXT:    vmovntps %ymm0, (%rdi)
788; VLX-NEXT:    vzeroupper
789; VLX-NEXT:    retq
790  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
791  ret void
792}
793
794define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
795; SSE-LABEL: test_arg_v4f64:
796; SSE:       # %bb.0:
797; SSE-NEXT:    movntps %xmm1, 16(%rdi)
798; SSE-NEXT:    movntps %xmm0, (%rdi)
799; SSE-NEXT:    retq
800;
801; AVX-LABEL: test_arg_v4f64:
802; AVX:       # %bb.0:
803; AVX-NEXT:    vmovntps %ymm0, (%rdi)
804; AVX-NEXT:    vzeroupper
805; AVX-NEXT:    retq
806;
807; VLX-LABEL: test_arg_v4f64:
808; VLX:       # %bb.0:
809; VLX-NEXT:    vmovntps %ymm0, (%rdi)
810; VLX-NEXT:    vzeroupper
811; VLX-NEXT:    retq
812  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
813  ret void
814}
815
816define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
817; SSE-LABEL: test_arg_v4i64:
818; SSE:       # %bb.0:
819; SSE-NEXT:    movntps %xmm1, 16(%rdi)
820; SSE-NEXT:    movntps %xmm0, (%rdi)
821; SSE-NEXT:    retq
822;
823; AVX-LABEL: test_arg_v4i64:
824; AVX:       # %bb.0:
825; AVX-NEXT:    vmovntps %ymm0, (%rdi)
826; AVX-NEXT:    vzeroupper
827; AVX-NEXT:    retq
828;
829; VLX-LABEL: test_arg_v4i64:
830; VLX:       # %bb.0:
831; VLX-NEXT:    vmovntps %ymm0, (%rdi)
832; VLX-NEXT:    vzeroupper
833; VLX-NEXT:    retq
834  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
835  ret void
836}
837
838define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
839; SSE-LABEL: test_arg_v16i16:
840; SSE:       # %bb.0:
841; SSE-NEXT:    movntps %xmm1, 16(%rdi)
842; SSE-NEXT:    movntps %xmm0, (%rdi)
843; SSE-NEXT:    retq
844;
845; AVX-LABEL: test_arg_v16i16:
846; AVX:       # %bb.0:
847; AVX-NEXT:    vmovntps %ymm0, (%rdi)
848; AVX-NEXT:    vzeroupper
849; AVX-NEXT:    retq
850;
851; VLX-LABEL: test_arg_v16i16:
852; VLX:       # %bb.0:
853; VLX-NEXT:    vmovntps %ymm0, (%rdi)
854; VLX-NEXT:    vzeroupper
855; VLX-NEXT:    retq
856  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
857  ret void
858}
859
860define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
861; SSE-LABEL: test_arg_v32i8:
862; SSE:       # %bb.0:
863; SSE-NEXT:    movntps %xmm1, 16(%rdi)
864; SSE-NEXT:    movntps %xmm0, (%rdi)
865; SSE-NEXT:    retq
866;
867; AVX-LABEL: test_arg_v32i8:
868; AVX:       # %bb.0:
869; AVX-NEXT:    vmovntps %ymm0, (%rdi)
870; AVX-NEXT:    vzeroupper
871; AVX-NEXT:    retq
872;
873; VLX-LABEL: test_arg_v32i8:
874; VLX:       # %bb.0:
875; VLX-NEXT:    vmovntps %ymm0, (%rdi)
876; VLX-NEXT:    vzeroupper
877; VLX-NEXT:    retq
878  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
879  ret void
880}
881
882
883; Now check that if the execution domain is trivially visible, we use it.
884; We use an add to make the type survive all the way to the MOVNT.
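; (x86 vector instructions are split into integer and floating-point execution
; domains, and using the store that matches the producing instruction's domain
; avoids a potential domain-crossing bypass delay. MOVNTDQ is the integer-domain
; NT store, MOVNTPS/MOVNTPD the FP-domain ones. The typed add below
; (paddd/addps/addpd) tells the backend which domain to use, unlike the xorps
; zeroing above.)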
885
886define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
887; SSE-LABEL: test_op_v4f32:
888; SSE:       # %bb.0:
889; SSE-NEXT:    addps %xmm1, %xmm0
890; SSE-NEXT:    movntps %xmm0, (%rdi)
891; SSE-NEXT:    retq
892;
893; AVX-LABEL: test_op_v4f32:
894; AVX:       # %bb.0:
895; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
896; AVX-NEXT:    vmovntps %xmm0, (%rdi)
897; AVX-NEXT:    retq
898;
899; VLX-LABEL: test_op_v4f32:
900; VLX:       # %bb.0:
901; VLX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
902; VLX-NEXT:    vmovntps %xmm0, (%rdi)
903; VLX-NEXT:    retq
904  %r = fadd <4 x float> %a, %b
905  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
906  ret void
907}
908
909define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
910; SSE-LABEL: test_op_v4i32:
911; SSE:       # %bb.0:
912; SSE-NEXT:    paddd %xmm1, %xmm0
913; SSE-NEXT:    movntdq %xmm0, (%rdi)
914; SSE-NEXT:    retq
915;
916; AVX-LABEL: test_op_v4i32:
917; AVX:       # %bb.0:
918; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
919; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
920; AVX-NEXT:    retq
921;
922; VLX-LABEL: test_op_v4i32:
923; VLX:       # %bb.0:
924; VLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
925; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
926; VLX-NEXT:    retq
927  %r = add <4 x i32> %a, %b
928  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
929  ret void
930}
931
932define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
933; SSE-LABEL: test_op_v2f64:
934; SSE:       # %bb.0:
935; SSE-NEXT:    addpd %xmm1, %xmm0
936; SSE-NEXT:    movntpd %xmm0, (%rdi)
937; SSE-NEXT:    retq
938;
939; AVX-LABEL: test_op_v2f64:
940; AVX:       # %bb.0:
941; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
942; AVX-NEXT:    vmovntpd %xmm0, (%rdi)
943; AVX-NEXT:    retq
944;
945; VLX-LABEL: test_op_v2f64:
946; VLX:       # %bb.0:
947; VLX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
948; VLX-NEXT:    vmovntpd %xmm0, (%rdi)
949; VLX-NEXT:    retq
950  %r = fadd <2 x double> %a, %b
951  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
952  ret void
953}
954
955define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
956; SSE-LABEL: test_op_v2i64:
957; SSE:       # %bb.0:
958; SSE-NEXT:    paddq %xmm1, %xmm0
959; SSE-NEXT:    movntdq %xmm0, (%rdi)
960; SSE-NEXT:    retq
961;
962; AVX-LABEL: test_op_v2i64:
963; AVX:       # %bb.0:
964; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
965; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
966; AVX-NEXT:    retq
967;
968; VLX-LABEL: test_op_v2i64:
969; VLX:       # %bb.0:
970; VLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
971; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
972; VLX-NEXT:    retq
973  %r = add <2 x i64> %a, %b
974  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
975  ret void
976}
977
978define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
979; SSE-LABEL: test_op_v8i16:
980; SSE:       # %bb.0:
981; SSE-NEXT:    paddw %xmm1, %xmm0
982; SSE-NEXT:    movntdq %xmm0, (%rdi)
983; SSE-NEXT:    retq
984;
985; AVX-LABEL: test_op_v8i16:
986; AVX:       # %bb.0:
987; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
988; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
989; AVX-NEXT:    retq
990;
991; VLX-LABEL: test_op_v8i16:
992; VLX:       # %bb.0:
993; VLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
994; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
995; VLX-NEXT:    retq
996  %r = add <8 x i16> %a, %b
997  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
998  ret void
999}
1000
1001define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
1002; SSE-LABEL: test_op_v16i8:
1003; SSE:       # %bb.0:
1004; SSE-NEXT:    paddb %xmm1, %xmm0
1005; SSE-NEXT:    movntdq %xmm0, (%rdi)
1006; SSE-NEXT:    retq
1007;
1008; AVX-LABEL: test_op_v16i8:
1009; AVX:       # %bb.0:
1010; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
1011; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
1012; AVX-NEXT:    retq
1013;
1014; VLX-LABEL: test_op_v16i8:
1015; VLX:       # %bb.0:
1016; VLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
1017; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
1018; VLX-NEXT:    retq
1019  %r = add <16 x i8> %a, %b
1020  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
1021  ret void
1022}
1023
1024; And now YMM versions.
1025
1026define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
1027; SSE-LABEL: test_op_v8f32:
1028; SSE:       # %bb.0:
1029; SSE-NEXT:    addps %xmm2, %xmm0
1030; SSE-NEXT:    addps %xmm3, %xmm1
1031; SSE-NEXT:    movntps %xmm1, 16(%rdi)
1032; SSE-NEXT:    movntps %xmm0, (%rdi)
1033; SSE-NEXT:    retq
1034;
1035; AVX-LABEL: test_op_v8f32:
1036; AVX:       # %bb.0:
1037; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
1038; AVX-NEXT:    vmovntps %ymm0, (%rdi)
1039; AVX-NEXT:    vzeroupper
1040; AVX-NEXT:    retq
1041;
1042; VLX-LABEL: test_op_v8f32:
1043; VLX:       # %bb.0:
1044; VLX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
1045; VLX-NEXT:    vmovntps %ymm0, (%rdi)
1046; VLX-NEXT:    vzeroupper
1047; VLX-NEXT:    retq
1048  %r = fadd <8 x float> %a, %b
1049  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
1050  ret void
1051}
1052
1053define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
1054; SSE-LABEL: test_op_v8i32:
1055; SSE:       # %bb.0:
1056; SSE-NEXT:    paddd %xmm2, %xmm0
1057; SSE-NEXT:    paddd %xmm3, %xmm1
1058; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
1059; SSE-NEXT:    movntdq %xmm0, (%rdi)
1060; SSE-NEXT:    retq
1061;
1062; AVX1-LABEL: test_op_v8i32:
1063; AVX1:       # %bb.0:
1064; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1065; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1066; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
1067; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1068; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1069; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
1070; AVX1-NEXT:    vzeroupper
1071; AVX1-NEXT:    retq
1072;
1073; AVX2-LABEL: test_op_v8i32:
1074; AVX2:       # %bb.0:
1075; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1076; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
1077; AVX2-NEXT:    vzeroupper
1078; AVX2-NEXT:    retq
1079;
1080; VLX-LABEL: test_op_v8i32:
1081; VLX:       # %bb.0:
1082; VLX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
1083; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
1084; VLX-NEXT:    vzeroupper
1085; VLX-NEXT:    retq
1086  %r = add <8 x i32> %a, %b
1087  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
1088  ret void
1089}
1090
1091define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
1092; SSE-LABEL: test_op_v4f64:
1093; SSE:       # %bb.0:
1094; SSE-NEXT:    addpd %xmm2, %xmm0
1095; SSE-NEXT:    addpd %xmm3, %xmm1
1096; SSE-NEXT:    movntpd %xmm1, 16(%rdi)
1097; SSE-NEXT:    movntpd %xmm0, (%rdi)
1098; SSE-NEXT:    retq
1099;
1100; AVX-LABEL: test_op_v4f64:
1101; AVX:       # %bb.0:
1102; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
1103; AVX-NEXT:    vmovntpd %ymm0, (%rdi)
1104; AVX-NEXT:    vzeroupper
1105; AVX-NEXT:    retq
1106;
1107; VLX-LABEL: test_op_v4f64:
1108; VLX:       # %bb.0:
1109; VLX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
1110; VLX-NEXT:    vmovntpd %ymm0, (%rdi)
1111; VLX-NEXT:    vzeroupper
1112; VLX-NEXT:    retq
1113  %r = fadd <4 x double> %a, %b
1114  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
1115  ret void
1116}
1117
1118define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
1119; SSE-LABEL: test_op_v4i64:
1120; SSE:       # %bb.0:
1121; SSE-NEXT:    paddq %xmm2, %xmm0
1122; SSE-NEXT:    paddq %xmm3, %xmm1
1123; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
1124; SSE-NEXT:    movntdq %xmm0, (%rdi)
1125; SSE-NEXT:    retq
1126;
1127; AVX1-LABEL: test_op_v4i64:
1128; AVX1:       # %bb.0:
1129; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1130; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1131; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
1132; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
1133; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1134; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
1135; AVX1-NEXT:    vzeroupper
1136; AVX1-NEXT:    retq
1137;
1138; AVX2-LABEL: test_op_v4i64:
1139; AVX2:       # %bb.0:
1140; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
1141; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
1142; AVX2-NEXT:    vzeroupper
1143; AVX2-NEXT:    retq
1144;
1145; VLX-LABEL: test_op_v4i64:
1146; VLX:       # %bb.0:
1147; VLX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
1148; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
1149; VLX-NEXT:    vzeroupper
1150; VLX-NEXT:    retq
1151  %r = add <4 x i64> %a, %b
1152  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
1153  ret void
1154}
1155
1156define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
1157; SSE-LABEL: test_op_v16i16:
1158; SSE:       # %bb.0:
1159; SSE-NEXT:    paddw %xmm2, %xmm0
1160; SSE-NEXT:    paddw %xmm3, %xmm1
1161; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
1162; SSE-NEXT:    movntdq %xmm0, (%rdi)
1163; SSE-NEXT:    retq
1164;
1165; AVX1-LABEL: test_op_v16i16:
1166; AVX1:       # %bb.0:
1167; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1168; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1169; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
1170; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1171; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1172; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
1173; AVX1-NEXT:    vzeroupper
1174; AVX1-NEXT:    retq
1175;
1176; AVX2-LABEL: test_op_v16i16:
1177; AVX2:       # %bb.0:
1178; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1179; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
1180; AVX2-NEXT:    vzeroupper
1181; AVX2-NEXT:    retq
1182;
1183; VLX-LABEL: test_op_v16i16:
1184; VLX:       # %bb.0:
1185; VLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
1186; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
1187; VLX-NEXT:    vzeroupper
1188; VLX-NEXT:    retq
1189  %r = add <16 x i16> %a, %b
1190  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
1191  ret void
1192}
1193
1194define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
1195; SSE-LABEL: test_op_v32i8:
1196; SSE:       # %bb.0:
1197; SSE-NEXT:    paddb %xmm2, %xmm0
1198; SSE-NEXT:    paddb %xmm3, %xmm1
1199; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
1200; SSE-NEXT:    movntdq %xmm0, (%rdi)
1201; SSE-NEXT:    retq
1202;
1203; AVX1-LABEL: test_op_v32i8:
1204; AVX1:       # %bb.0:
1205; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1206; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1207; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
1208; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
1209; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1210; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
1211; AVX1-NEXT:    vzeroupper
1212; AVX1-NEXT:    retq
1213;
1214; AVX2-LABEL: test_op_v32i8:
1215; AVX2:       # %bb.0:
1216; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
1217; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
1218; AVX2-NEXT:    vzeroupper
1219; AVX2-NEXT:    retq
1220;
1221; VLX-LABEL: test_op_v32i8:
1222; VLX:       # %bb.0:
1223; VLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
1224; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
1225; VLX-NEXT:    vzeroupper
1226; VLX-NEXT:    retq
1227  %r = add <32 x i8> %a, %b
1228  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
1229  ret void
1230}
1231
1232; 256-bit NT stores require 256-bit alignment.
1233; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
234; could even scalarize to movnti when we only have 1-byte alignment: the
235; non-temporal hint is probably always worth even a ~20-instruction scalarization.
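; As the AVX/VLX checks below show, when the 256-bit store is only 16-byte
; aligned the !nontemporal hint is currently dropped and a regular vmovups is
; emitted, rather than splitting into two aligned 128-bit movntps. The SSE path
; is unaffected, since 16-byte alignment is enough for its two xmm halves.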
1236define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
1237; SSE-LABEL: test_unaligned_v8f32:
1238; SSE:       # %bb.0:
1239; SSE-NEXT:    addps %xmm2, %xmm0
1240; SSE-NEXT:    addps %xmm3, %xmm1
1241; SSE-NEXT:    movntps %xmm1, 16(%rdi)
1242; SSE-NEXT:    movntps %xmm0, (%rdi)
1243; SSE-NEXT:    retq
1244;
1245; AVX-LABEL: test_unaligned_v8f32:
1246; AVX:       # %bb.0:
1247; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
1248; AVX-NEXT:    vmovups %ymm0, (%rdi)
1249; AVX-NEXT:    vzeroupper
1250; AVX-NEXT:    retq
1251;
1252; VLX-LABEL: test_unaligned_v8f32:
1253; VLX:       # %bb.0:
1254; VLX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
1255; VLX-NEXT:    vmovups %ymm0, (%rdi)
1256; VLX-NEXT:    vzeroupper
1257; VLX-NEXT:    retq
1258  %r = fadd <8 x float> %a, %b
1259  store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
1260  ret void
1261}
1262
1263!1 = !{i32 1}
1264