; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL

;
; Half to Float
;

define float @cvt_i16_to_f32(i16 %a0) nounwind {
; ALL-LABEL: cvt_i16_to_f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl %di, %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movq %rax, %rdx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrq $48, %rdx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %esi, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_4i16_to_4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movq %rax, %rdx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrq $48, %rdx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %esi, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_4i16_to_4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movq %rax, %rdx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrq $48, %rdx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vmovd %esi, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movq %rax, %rdx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrq $48, %rdx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vmovd %esi, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movq %rax, %rdx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrq $48, %rdx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %esi, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_8i16_to_4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movq %rax, %rdx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrq $48, %rdx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %esi, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_8i16_to_4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movq %rax, %rdx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrq $48, %rdx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vmovd %esi, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movq %rax, %rdx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrq $48, %rdx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vmovd %esi, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpextrq $1, %xmm0, %rdx
; ALL-NEXT:    movq %rdx, %r8
; ALL-NEXT:    movq %rdx, %r10
; ALL-NEXT:    movswl %dx, %r9d
; ALL-NEXT:    # kill: def $edx killed $edx killed $rdx
; ALL-NEXT:    shrl $16, %edx
; ALL-NEXT:    shrq $32, %r8
; ALL-NEXT:    shrq $48, %r10
; ALL-NEXT:    vmovq %xmm0, %rdi
; ALL-NEXT:    movq %rdi, %rax
; ALL-NEXT:    movq %rdi, %rsi
; ALL-NEXT:    movswl %di, %ecx
; ALL-NEXT:    # kill: def $edi killed $edi killed $rdi
; ALL-NEXT:    shrl $16, %edi
; ALL-NEXT:    shrq $32, %rax
; ALL-NEXT:    shrq $48, %rsi
; ALL-NEXT:    movswl %si, %esi
; ALL-NEXT:    vmovd %esi, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    cwtl
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl %di, %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    vmovd %ecx, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    movswl %r10w, %eax
; ALL-NEXT:    vmovd %eax, %xmm4
; ALL-NEXT:    vcvtph2ps %xmm4, %xmm4
; ALL-NEXT:    movswl %r8w, %eax
; ALL-NEXT:    vmovd %eax, %xmm5
; ALL-NEXT:    vcvtph2ps %xmm5, %xmm5
; ALL-NEXT:    movswl %dx, %eax
; ALL-NEXT:    vmovd %eax, %xmm6
; ALL-NEXT:    vcvtph2ps %xmm6, %xmm6
; ALL-NEXT:    vmovd %r9d, %xmm7
; ALL-NEXT:    vcvtph2ps %xmm7, %xmm7
; ALL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; ALL-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}

define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vmovq %xmm4, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm8
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm9
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm10
; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm11
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm12
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm13
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm14
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm15
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm3
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm5
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm6
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm7
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX1-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX1-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX1-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX1-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX1-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX1-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX1-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vmovq %xmm4, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm8
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm9
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm10
; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm11
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm12
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm13
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm14
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm15
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm2
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm3
; AVX2-NEXT:    movswl %ax, %ecx
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm4
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $48, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm5
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm6
; AVX2-NEXT:    movl %eax, %ecx
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm7
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX2-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX2-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX2-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX2-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX2-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX2-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX2-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_16i16_to_16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm10
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm8
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm9
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm11
; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512F-NEXT:    vmovd %ecx, %xmm12
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm13
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm14
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm15
; AVX512F-NEXT:    vmovq %xmm10, %rax
; AVX512F-NEXT:    vmovd %ecx, %xmm2
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm3
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm4
; AVX512F-NEXT:    vpextrq $1, %xmm10, %rax
; AVX512F-NEXT:    vmovd %ecx, %xmm10
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm5
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm6
; AVX512F-NEXT:    movl %eax, %ecx
; AVX512F-NEXT:    shrl $16, %ecx
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm7
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX512F-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX512F-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX512F-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX512F-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX512F-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX512F-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT:    vcvtph2ps %xmm10, %xmm10
; AVX512F-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm10
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm8
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm9
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm11
; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT:    vmovd %ecx, %xmm12
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm13
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm14
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm15
; AVX512VL-NEXT:    vmovq %xmm10, %rax
; AVX512VL-NEXT:    vmovd %ecx, %xmm16
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm17
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm18
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm19
; AVX512VL-NEXT:    vpextrq $1, %xmm10, %rax
; AVX512VL-NEXT:    vmovd %ecx, %xmm10
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $48, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm20
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm21
; AVX512VL-NEXT:    movl %eax, %ecx
; AVX512VL-NEXT:    shrl $16, %ecx
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm22
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm8, %xmm8
; AVX512VL-NEXT:    vcvtph2ps %xmm9, %xmm9
; AVX512VL-NEXT:    vcvtph2ps %xmm11, %xmm11
; AVX512VL-NEXT:    vcvtph2ps %xmm12, %xmm12
; AVX512VL-NEXT:    vcvtph2ps %xmm13, %xmm13
; AVX512VL-NEXT:    vcvtph2ps %xmm14, %xmm14
; AVX512VL-NEXT:    vcvtph2ps %xmm15, %xmm15
; AVX512VL-NEXT:    vcvtph2ps %xmm16, %xmm16
; AVX512VL-NEXT:    vcvtph2ps %xmm17, %xmm4
; AVX512VL-NEXT:    vcvtph2ps %xmm18, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm19, %xmm5
; AVX512VL-NEXT:    vcvtph2ps %xmm10, %xmm7
; AVX512VL-NEXT:    vcvtph2ps %xmm20, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm21, %xmm6
; AVX512VL-NEXT:    vcvtph2ps %xmm22, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

;
; Half to Float (Load)
;

define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
; ALL-LABEL: load_cvt_i16_to_f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = load i16, i16* %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}

define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl 6(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl 4(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    movswl 2(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movq %rax, %rdx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrq $48, %rdx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %esi, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movq %rax, %rdx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrq $48, %rdx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %esi, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movq (%rdi), %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movq %rax, %rdx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrq $48, %rdx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    vmovd %esi, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movq %rax, %rdx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrq $48, %rdx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    vmovd %esi, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl 6(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl 4(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    movswl (%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    movswl 2(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    movswl 14(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm4
; ALL-NEXT:    vcvtph2ps %xmm4, %xmm4
; ALL-NEXT:    movswl 12(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm5
; ALL-NEXT:    vcvtph2ps %xmm5, %xmm5
; ALL-NEXT:    movswl 8(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm6
; ALL-NEXT:    vcvtph2ps %xmm6, %xmm6
; ALL-NEXT:    movswl 10(%rdi), %eax
; ALL-NEXT:    vmovd %eax, %xmm7
; ALL-NEXT:    vcvtph2ps %xmm7, %xmm7
; ALL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; ALL-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}

define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movswl 22(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm8
; AVX1-NEXT:    movswl 20(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm9
; AVX1-NEXT:    movswl 16(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm10
; AVX1-NEXT:    movswl 18(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm11
; AVX1-NEXT:    movswl 30(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm12
; AVX1-NEXT:    movswl 28(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm13
; AVX1-NEXT:    movswl 24(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm14
; AVX1-NEXT:    movswl 26(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm15
; AVX1-NEXT:    movswl 6(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    movswl 4(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    movswl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    movswl 2(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT:    movswl 14(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    movswl 12(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    movswl 8(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    movswl 10(%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movswl 22(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm8
; AVX2-NEXT:    movswl 20(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm9
; AVX2-NEXT:    movswl 16(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm10
; AVX2-NEXT:    movswl 18(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm11
; AVX2-NEXT:    movswl 30(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm12
; AVX2-NEXT:    movswl 28(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm13
; AVX2-NEXT:    movswl 24(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm14
; AVX2-NEXT:    movswl 26(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm15
; AVX2-NEXT:    movswl 6(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    movswl 4(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    movswl (%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    movswl 2(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm4
; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT:    movswl 14(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    movswl 12(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    movswl 8(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    movswl 10(%rdi), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_cvt_16i16_to_16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movswl 6(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm8
; AVX512F-NEXT:    movswl 4(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm9
; AVX512F-NEXT:    movswl (%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm10
; AVX512F-NEXT:    movswl 2(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm11
; AVX512F-NEXT:    movswl 14(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm12
; AVX512F-NEXT:    movswl 12(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm13
; AVX512F-NEXT:    movswl 8(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm14
; AVX512F-NEXT:    movswl 10(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm15
; AVX512F-NEXT:    movswl 22(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    movswl 20(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    movswl 16(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    movswl 18(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    movswl 30(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm4
; AVX512F-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT:    movswl 28(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm5
; AVX512F-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT:    movswl 24(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm6
; AVX512F-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT:    movswl 26(%rdi), %eax
; AVX512F-NEXT:    vmovd %eax, %xmm7
; AVX512F-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: load_cvt_16i16_to_16f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    movswl 6(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm8
; AVX512VL-NEXT:    movswl 4(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm9
; AVX512VL-NEXT:    movswl (%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm10
; AVX512VL-NEXT:    movswl 2(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm11
; AVX512VL-NEXT:    movswl 14(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm4
; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm12
; AVX512VL-NEXT:    movswl 12(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm5
; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm13
; AVX512VL-NEXT:    movswl 8(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm6
; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm14
; AVX512VL-NEXT:    movswl 10(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm7
; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm15
; AVX512VL-NEXT:    movswl 22(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    movswl 20(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    movswl 16(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    movswl 18(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    movswl 30(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm4
; AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512VL-NEXT:    movswl 28(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm5
; AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512VL-NEXT:    movswl 24(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm6
; AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512VL-NEXT:    movswl 26(%rdi), %eax
; AVX512VL-NEXT:    vmovd %eax, %xmm7
; AVX512VL-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %a0
  %2 = bitcast <16 x i16> %1 to <16 x half>
  %3 = fpext <16 x half> %2 to <16 x float>
  ret <16 x float> %3
}

;
; Half to Double
;

define double @cvt_i16_to_f64(i16 %a0) nounwind {
; ALL-LABEL: cvt_i16_to_f64:
; ALL:       # %bb.0:
; ALL-NEXT:    movswl %di, %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to double
  ret double %2
}

define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_2i16_to_2f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vmovd %ecx, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: cvt_2i16_to_2f64:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX2-SLOW-NEXT:    movswl %ax, %ecx
; AVX2-SLOW-NEXT:    shrl $16, %eax
; AVX2-SLOW-NEXT:    cwtl
; AVX2-SLOW-NEXT:    vmovd %eax, %xmm0
; AVX2-SLOW-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vmovd %ecx, %xmm1
; AVX2-SLOW-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: cvt_2i16_to_2f64:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
; AVX2-FAST-NEXT:    movswl %ax, %ecx
; AVX2-FAST-NEXT:    shrl $16, %eax
; AVX2-FAST-NEXT:    cwtl
; AVX2-FAST-NEXT:    vmovd %eax, %xmm0
; AVX2-FAST-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovd %ecx, %xmm1
; AVX2-FAST-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-FAST-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: cvt_2i16_to_2f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    movswl %ax, %ecx
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %ecx, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; AVX512VL-NEXT:    movswl %ax, %ecx
; AVX512VL-NEXT:    shrl $16, %eax
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    vmovd %ecx, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT:    retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = fpext <2 x half> %1 to <2 x double>
  ret <2 x double> %2
}

define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    movl %eax, %edx
; AVX1-NEXT:    movswl %ax, %esi
; AVX1-NEXT:    shrq $48, %rax
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movswl %dx, %edx
; AVX1-NEXT:    vmovd %edx, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_4i16_to_4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    movq %rax, %rcx
; AVX2-NEXT:    movl %eax, %edx
; AVX2-NEXT:    movswl %ax, %esi
; AVX2-NEXT:    shrq $48, %rax
; AVX2-NEXT:    shrq $32, %rcx
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movswl %dx, %edx
; AVX2-NEXT:    vmovd %edx, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    vmovd %esi, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    movswl %cx, %ecx
; AVX2-NEXT:    vmovd %ecx, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: cvt_4i16_to_4f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movl %eax, %edx
; AVX512F-NEXT:    movswl %ax, %esi
; AVX512F-NEXT:    shrq $48, %rax
; AVX512F-NEXT:    shrq $32, %rcx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    movswl %dx, %edx
; AVX512F-NEXT:    vmovd %edx, %xmm0
; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %esi, %xmm1
; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT:    movswl %cx, %ecx
; AVX512F-NEXT:    vmovd %ecx, %xmm2
; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT:    cwtl
; AVX512F-NEXT:    vmovd %eax, %xmm3
; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    movq %rax, %rcx
; AVX512VL-NEXT:    movl %eax, %edx
; AVX512VL-NEXT:    movswl %ax, %esi
; AVX512VL-NEXT:    shrq $48, %rax
; AVX512VL-NEXT:    shrq $32, %rcx
; AVX512VL-NEXT:    shrl $16, %edx
; AVX512VL-NEXT:    movswl %dx, %edx
; AVX512VL-NEXT:    vmovd %edx, %xmm0
; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT:    vmovd %esi, %xmm1
; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT:    movswl %cx, %ecx
; AVX512VL-NEXT:    vmovd %ecx, %xmm2
; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT:    cwtl
; AVX512VL-NEXT:    vmovd %eax, %xmm3
; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x double>
  ret <4 x double> %2
}
1387
1388define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
1389; AVX1-LABEL: cvt_8i16_to_2f64:
1390; AVX1:       # %bb.0:
1391; AVX1-NEXT:    vmovd %xmm0, %eax
1392; AVX1-NEXT:    movswl %ax, %ecx
1393; AVX1-NEXT:    shrl $16, %eax
1394; AVX1-NEXT:    cwtl
1395; AVX1-NEXT:    vmovd %eax, %xmm0
1396; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1397; AVX1-NEXT:    vmovd %ecx, %xmm1
1398; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1399; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1400; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1401; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1402; AVX1-NEXT:    retq
1403;
1404; AVX2-LABEL: cvt_8i16_to_2f64:
1405; AVX2:       # %bb.0:
1406; AVX2-NEXT:    vmovd %xmm0, %eax
1407; AVX2-NEXT:    movswl %ax, %ecx
1408; AVX2-NEXT:    shrl $16, %eax
1409; AVX2-NEXT:    cwtl
1410; AVX2-NEXT:    vmovd %eax, %xmm0
1411; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1412; AVX2-NEXT:    vmovd %ecx, %xmm1
1413; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1414; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1415; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1416; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1417; AVX2-NEXT:    retq
1418;
1419; AVX512F-LABEL: cvt_8i16_to_2f64:
1420; AVX512F:       # %bb.0:
1421; AVX512F-NEXT:    vmovd %xmm0, %eax
1422; AVX512F-NEXT:    movswl %ax, %ecx
1423; AVX512F-NEXT:    shrl $16, %eax
1424; AVX512F-NEXT:    cwtl
1425; AVX512F-NEXT:    vmovd %eax, %xmm0
1426; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
1427; AVX512F-NEXT:    vmovd %ecx, %xmm1
1428; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
1429; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1430; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1431; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1432; AVX512F-NEXT:    retq
1433;
1434; AVX512VL-LABEL: cvt_8i16_to_2f64:
1435; AVX512VL:       # %bb.0:
1436; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1437; AVX512VL-NEXT:    vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
1438; AVX512VL-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
1439; AVX512VL-NEXT:    movswl %ax, %ecx
1440; AVX512VL-NEXT:    shrl $16, %eax
1441; AVX512VL-NEXT:    cwtl
1442; AVX512VL-NEXT:    vmovd %eax, %xmm0
1443; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1444; AVX512VL-NEXT:    vmovd %ecx, %xmm1
1445; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1446; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1447; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1448; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1449; AVX512VL-NEXT:    retq
1450  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1451  %2 = bitcast <2 x i16> %1 to <2 x half>
1452  %3 = fpext <2 x half> %2 to <2 x double>
1453  ret <2 x double> %3
1454}
1455
1456define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
1457; AVX1-LABEL: cvt_8i16_to_4f64:
1458; AVX1:       # %bb.0:
1459; AVX1-NEXT:    vmovq %xmm0, %rax
1460; AVX1-NEXT:    movq %rax, %rcx
1461; AVX1-NEXT:    movl %eax, %edx
1462; AVX1-NEXT:    movswl %ax, %esi
1463; AVX1-NEXT:    shrq $48, %rax
1464; AVX1-NEXT:    shrq $32, %rcx
1465; AVX1-NEXT:    shrl $16, %edx
1466; AVX1-NEXT:    movswl %dx, %edx
1467; AVX1-NEXT:    vmovd %edx, %xmm0
1468; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1469; AVX1-NEXT:    vmovd %esi, %xmm1
1470; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1471; AVX1-NEXT:    movswl %cx, %ecx
1472; AVX1-NEXT:    vmovd %ecx, %xmm2
1473; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
1474; AVX1-NEXT:    cwtl
1475; AVX1-NEXT:    vmovd %eax, %xmm3
1476; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
1477; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1478; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1479; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1480; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1481; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1482; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1483; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1484; AVX1-NEXT:    retq
1485;
1486; AVX2-LABEL: cvt_8i16_to_4f64:
1487; AVX2:       # %bb.0:
1488; AVX2-NEXT:    vmovq %xmm0, %rax
1489; AVX2-NEXT:    movq %rax, %rcx
1490; AVX2-NEXT:    movl %eax, %edx
1491; AVX2-NEXT:    movswl %ax, %esi
1492; AVX2-NEXT:    shrq $48, %rax
1493; AVX2-NEXT:    shrq $32, %rcx
1494; AVX2-NEXT:    shrl $16, %edx
1495; AVX2-NEXT:    movswl %dx, %edx
1496; AVX2-NEXT:    vmovd %edx, %xmm0
1497; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1498; AVX2-NEXT:    vmovd %esi, %xmm1
1499; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1500; AVX2-NEXT:    movswl %cx, %ecx
1501; AVX2-NEXT:    vmovd %ecx, %xmm2
1502; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
1503; AVX2-NEXT:    cwtl
1504; AVX2-NEXT:    vmovd %eax, %xmm3
1505; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
1506; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1507; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1508; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1509; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1510; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1511; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1512; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1513; AVX2-NEXT:    retq
1514;
1515; AVX512F-LABEL: cvt_8i16_to_4f64:
1516; AVX512F:       # %bb.0:
1517; AVX512F-NEXT:    vmovq %xmm0, %rax
1518; AVX512F-NEXT:    movq %rax, %rcx
1519; AVX512F-NEXT:    movl %eax, %edx
1520; AVX512F-NEXT:    movswl %ax, %esi
1521; AVX512F-NEXT:    shrq $48, %rax
1522; AVX512F-NEXT:    shrq $32, %rcx
1523; AVX512F-NEXT:    shrl $16, %edx
1524; AVX512F-NEXT:    movswl %dx, %edx
1525; AVX512F-NEXT:    vmovd %edx, %xmm0
1526; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
1527; AVX512F-NEXT:    vmovd %esi, %xmm1
1528; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
1529; AVX512F-NEXT:    movswl %cx, %ecx
1530; AVX512F-NEXT:    vmovd %ecx, %xmm2
1531; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
1532; AVX512F-NEXT:    cwtl
1533; AVX512F-NEXT:    vmovd %eax, %xmm3
1534; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
1535; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1536; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1537; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1538; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1539; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1540; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1541; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1542; AVX512F-NEXT:    retq
1543;
1544; AVX512VL-LABEL: cvt_8i16_to_4f64:
1545; AVX512VL:       # %bb.0:
1546; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1547; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
1548; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
1549; AVX512VL-NEXT:    movq %rax, %rcx
1550; AVX512VL-NEXT:    movl %eax, %edx
1551; AVX512VL-NEXT:    movswl %ax, %esi
1552; AVX512VL-NEXT:    shrq $48, %rax
1553; AVX512VL-NEXT:    shrq $32, %rcx
1554; AVX512VL-NEXT:    shrl $16, %edx
1555; AVX512VL-NEXT:    movswl %dx, %edx
1556; AVX512VL-NEXT:    vmovd %edx, %xmm0
1557; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1558; AVX512VL-NEXT:    vmovd %esi, %xmm1
1559; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1560; AVX512VL-NEXT:    movswl %cx, %ecx
1561; AVX512VL-NEXT:    vmovd %ecx, %xmm2
1562; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
1563; AVX512VL-NEXT:    cwtl
1564; AVX512VL-NEXT:    vmovd %eax, %xmm3
1565; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
1566; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1567; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1568; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1569; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1570; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1571; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1572; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1573; AVX512VL-NEXT:    retq
1574  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1575  %2 = bitcast <4 x i16> %1 to <4 x half>
1576  %3 = fpext <4 x half> %2 to <4 x double>
1577  ret <4 x double> %3
1578}
1579
1580define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
1581; AVX1-LABEL: cvt_8i16_to_8f64:
1582; AVX1:       # %bb.0:
1583; AVX1-NEXT:    vmovq %xmm0, %rdx
1584; AVX1-NEXT:    movq %rdx, %r9
1585; AVX1-NEXT:    movl %edx, %r10d
1586; AVX1-NEXT:    movswl %dx, %r8d
1587; AVX1-NEXT:    shrq $48, %rdx
1588; AVX1-NEXT:    shrq $32, %r9
1589; AVX1-NEXT:    shrl $16, %r10d
1590; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
1591; AVX1-NEXT:    movq %rdi, %rsi
1592; AVX1-NEXT:    movl %edi, %eax
1593; AVX1-NEXT:    movswl %di, %ecx
1594; AVX1-NEXT:    shrq $48, %rdi
1595; AVX1-NEXT:    shrq $32, %rsi
1596; AVX1-NEXT:    shrl $16, %eax
1597; AVX1-NEXT:    cwtl
1598; AVX1-NEXT:    vmovd %eax, %xmm0
1599; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
1600; AVX1-NEXT:    vmovd %ecx, %xmm0
1601; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
1602; AVX1-NEXT:    movswl %si, %eax
1603; AVX1-NEXT:    vmovd %eax, %xmm0
1604; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
1605; AVX1-NEXT:    movswl %di, %eax
1606; AVX1-NEXT:    vmovd %eax, %xmm0
1607; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
1608; AVX1-NEXT:    movswl %r10w, %eax
1609; AVX1-NEXT:    vmovd %eax, %xmm0
1610; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1611; AVX1-NEXT:    vmovd %r8d, %xmm5
1612; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
1613; AVX1-NEXT:    movswl %r9w, %eax
1614; AVX1-NEXT:    vmovd %eax, %xmm6
1615; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
1616; AVX1-NEXT:    movswl %dx, %eax
1617; AVX1-NEXT:    vmovd %eax, %xmm7
1618; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
1619; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1620; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1621; AVX1-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1622; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1623; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1624; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1625; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1626; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1627; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1628; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1629; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1630; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1631; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1632; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1633; AVX1-NEXT:    retq
1634;
1635; AVX2-LABEL: cvt_8i16_to_8f64:
1636; AVX2:       # %bb.0:
1637; AVX2-NEXT:    vmovq %xmm0, %rdx
1638; AVX2-NEXT:    movq %rdx, %r9
1639; AVX2-NEXT:    movl %edx, %r10d
1640; AVX2-NEXT:    movswl %dx, %r8d
1641; AVX2-NEXT:    shrq $48, %rdx
1642; AVX2-NEXT:    shrq $32, %r9
1643; AVX2-NEXT:    shrl $16, %r10d
1644; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
1645; AVX2-NEXT:    movq %rdi, %rsi
1646; AVX2-NEXT:    movl %edi, %eax
1647; AVX2-NEXT:    movswl %di, %ecx
1648; AVX2-NEXT:    shrq $48, %rdi
1649; AVX2-NEXT:    shrq $32, %rsi
1650; AVX2-NEXT:    shrl $16, %eax
1651; AVX2-NEXT:    cwtl
1652; AVX2-NEXT:    vmovd %eax, %xmm0
1653; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
1654; AVX2-NEXT:    vmovd %ecx, %xmm0
1655; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
1656; AVX2-NEXT:    movswl %si, %eax
1657; AVX2-NEXT:    vmovd %eax, %xmm0
1658; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
1659; AVX2-NEXT:    movswl %di, %eax
1660; AVX2-NEXT:    vmovd %eax, %xmm0
1661; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
1662; AVX2-NEXT:    movswl %r10w, %eax
1663; AVX2-NEXT:    vmovd %eax, %xmm0
1664; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1665; AVX2-NEXT:    vmovd %r8d, %xmm5
1666; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
1667; AVX2-NEXT:    movswl %r9w, %eax
1668; AVX2-NEXT:    vmovd %eax, %xmm6
1669; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
1670; AVX2-NEXT:    movswl %dx, %eax
1671; AVX2-NEXT:    vmovd %eax, %xmm7
1672; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
1673; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1674; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1675; AVX2-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1676; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1677; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1678; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1679; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1680; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1681; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1682; AVX2-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1683; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1684; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1685; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1686; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1687; AVX2-NEXT:    retq
1688;
1689; AVX512-LABEL: cvt_8i16_to_8f64:
1690; AVX512:       # %bb.0:
1691; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
1692; AVX512-NEXT:    movq %rdx, %r9
1693; AVX512-NEXT:    movl %edx, %r10d
1694; AVX512-NEXT:    movswl %dx, %r8d
1695; AVX512-NEXT:    shrq $48, %rdx
1696; AVX512-NEXT:    shrq $32, %r9
1697; AVX512-NEXT:    shrl $16, %r10d
1698; AVX512-NEXT:    vmovq %xmm0, %rdi
1699; AVX512-NEXT:    movq %rdi, %rsi
1700; AVX512-NEXT:    movl %edi, %eax
1701; AVX512-NEXT:    movswl %di, %ecx
1702; AVX512-NEXT:    shrq $48, %rdi
1703; AVX512-NEXT:    shrq $32, %rsi
1704; AVX512-NEXT:    shrl $16, %eax
1705; AVX512-NEXT:    cwtl
1706; AVX512-NEXT:    vmovd %eax, %xmm0
1707; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1708; AVX512-NEXT:    vmovd %ecx, %xmm1
1709; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
1710; AVX512-NEXT:    movswl %si, %eax
1711; AVX512-NEXT:    vmovd %eax, %xmm2
1712; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1713; AVX512-NEXT:    movswl %di, %eax
1714; AVX512-NEXT:    vmovd %eax, %xmm3
1715; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1716; AVX512-NEXT:    movswl %r10w, %eax
1717; AVX512-NEXT:    vmovd %eax, %xmm4
1718; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
1719; AVX512-NEXT:    vmovd %r8d, %xmm5
1720; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1721; AVX512-NEXT:    movswl %r9w, %eax
1722; AVX512-NEXT:    vmovd %eax, %xmm6
1723; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
1724; AVX512-NEXT:    movswl %dx, %eax
1725; AVX512-NEXT:    vmovd %eax, %xmm7
1726; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
1727; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1728; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1729; AVX512-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1730; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1731; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1732; AVX512-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
1733; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
1734; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1735; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1736; AVX512-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1737; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1738; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1739; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1740; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1741; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1742; AVX512-NEXT:    retq
1743  %1 = bitcast <8 x i16> %a0 to <8 x half>
1744  %2 = fpext <8 x half> %1 to <8 x double>
1745  ret <8 x double> %2
1746}
1747
1748;
1749; Half to Double (Load)
1750;
1751
1752define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
1753; ALL-LABEL: load_cvt_i16_to_f64:
1754; ALL:       # %bb.0:
1755; ALL-NEXT:    movswl (%rdi), %eax
1756; ALL-NEXT:    vmovd %eax, %xmm0
1757; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1758; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1759; ALL-NEXT:    retq
1760  %1 = load i16, i16* %a0
1761  %2 = bitcast i16 %1 to half
1762  %3 = fpext half %2 to double
1763  ret double %3
1764}
1765
1766define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
1767; ALL-LABEL: load_cvt_2i16_to_2f64:
1768; ALL:       # %bb.0:
1769; ALL-NEXT:    movswl (%rdi), %eax
1770; ALL-NEXT:    vmovd %eax, %xmm0
1771; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1772; ALL-NEXT:    movswl 2(%rdi), %eax
1773; ALL-NEXT:    vmovd %eax, %xmm1
1774; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
1775; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1776; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1777; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1778; ALL-NEXT:    retq
1779  %1 = load <2 x i16>, <2 x i16>* %a0
1780  %2 = bitcast <2 x i16> %1 to <2 x half>
1781  %3 = fpext <2 x half> %2 to <2 x double>
1782  ret <2 x double> %3
1783}
1784
1785define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
1786; ALL-LABEL: load_cvt_4i16_to_4f64:
1787; ALL:       # %bb.0:
1788; ALL-NEXT:    movswl (%rdi), %eax
1789; ALL-NEXT:    vmovd %eax, %xmm0
1790; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
1791; ALL-NEXT:    movswl 2(%rdi), %eax
1792; ALL-NEXT:    vmovd %eax, %xmm1
1793; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
1794; ALL-NEXT:    movswl 4(%rdi), %eax
1795; ALL-NEXT:    vmovd %eax, %xmm2
1796; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
1797; ALL-NEXT:    movswl 6(%rdi), %eax
1798; ALL-NEXT:    vmovd %eax, %xmm3
1799; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
1800; ALL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1801; ALL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1802; ALL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1803; ALL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1804; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1805; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1806; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1807; ALL-NEXT:    retq
1808  %1 = load <4 x i16>, <4 x i16>* %a0
1809  %2 = bitcast <4 x i16> %1 to <4 x half>
1810  %3 = fpext <4 x half> %2 to <4 x double>
1811  ret <4 x double> %3
1812}
1813
1814define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
1815; AVX1-LABEL: load_cvt_8i16_to_4f64:
1816; AVX1:       # %bb.0:
1817; AVX1-NEXT:    movq (%rdi), %rax
1818; AVX1-NEXT:    movq %rax, %rcx
1819; AVX1-NEXT:    movl %eax, %edx
1820; AVX1-NEXT:    movswl %ax, %esi
1821; AVX1-NEXT:    shrq $48, %rax
1822; AVX1-NEXT:    shrq $32, %rcx
1823; AVX1-NEXT:    shrl $16, %edx
1824; AVX1-NEXT:    movswl %dx, %edx
1825; AVX1-NEXT:    vmovd %edx, %xmm0
1826; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1827; AVX1-NEXT:    vmovd %esi, %xmm1
1828; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
1829; AVX1-NEXT:    movswl %cx, %ecx
1830; AVX1-NEXT:    vmovd %ecx, %xmm2
1831; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
1832; AVX1-NEXT:    cwtl
1833; AVX1-NEXT:    vmovd %eax, %xmm3
1834; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
1835; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1836; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1837; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1838; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1839; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1840; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1841; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1842; AVX1-NEXT:    retq
1843;
1844; AVX2-LABEL: load_cvt_8i16_to_4f64:
1845; AVX2:       # %bb.0:
1846; AVX2-NEXT:    movq (%rdi), %rax
1847; AVX2-NEXT:    movq %rax, %rcx
1848; AVX2-NEXT:    movl %eax, %edx
1849; AVX2-NEXT:    movswl %ax, %esi
1850; AVX2-NEXT:    shrq $48, %rax
1851; AVX2-NEXT:    shrq $32, %rcx
1852; AVX2-NEXT:    shrl $16, %edx
1853; AVX2-NEXT:    movswl %dx, %edx
1854; AVX2-NEXT:    vmovd %edx, %xmm0
1855; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1856; AVX2-NEXT:    vmovd %esi, %xmm1
1857; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
1858; AVX2-NEXT:    movswl %cx, %ecx
1859; AVX2-NEXT:    vmovd %ecx, %xmm2
1860; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
1861; AVX2-NEXT:    cwtl
1862; AVX2-NEXT:    vmovd %eax, %xmm3
1863; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
1864; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1865; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1866; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1867; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1868; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1869; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1870; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1871; AVX2-NEXT:    retq
1872;
1873; AVX512F-LABEL: load_cvt_8i16_to_4f64:
1874; AVX512F:       # %bb.0:
1875; AVX512F-NEXT:    movq (%rdi), %rax
1876; AVX512F-NEXT:    movq %rax, %rcx
1877; AVX512F-NEXT:    movl %eax, %edx
1878; AVX512F-NEXT:    movswl %ax, %esi
1879; AVX512F-NEXT:    shrq $48, %rax
1880; AVX512F-NEXT:    shrq $32, %rcx
1881; AVX512F-NEXT:    shrl $16, %edx
1882; AVX512F-NEXT:    movswl %dx, %edx
1883; AVX512F-NEXT:    vmovd %edx, %xmm0
1884; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
1885; AVX512F-NEXT:    vmovd %esi, %xmm1
1886; AVX512F-NEXT:    vcvtph2ps %xmm1, %xmm1
1887; AVX512F-NEXT:    movswl %cx, %ecx
1888; AVX512F-NEXT:    vmovd %ecx, %xmm2
1889; AVX512F-NEXT:    vcvtph2ps %xmm2, %xmm2
1890; AVX512F-NEXT:    cwtl
1891; AVX512F-NEXT:    vmovd %eax, %xmm3
1892; AVX512F-NEXT:    vcvtph2ps %xmm3, %xmm3
1893; AVX512F-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1894; AVX512F-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1895; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1896; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1897; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1898; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1899; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1900; AVX512F-NEXT:    retq
1901;
1902; AVX512VL-LABEL: load_cvt_8i16_to_4f64:
1903; AVX512VL:       # %bb.0:
1904; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1905; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
1906; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
1907; AVX512VL-NEXT:    movq %rax, %rcx
1908; AVX512VL-NEXT:    movl %eax, %edx
1909; AVX512VL-NEXT:    movswl %ax, %esi
1910; AVX512VL-NEXT:    shrq $48, %rax
1911; AVX512VL-NEXT:    shrq $32, %rcx
1912; AVX512VL-NEXT:    shrl $16, %edx
1913; AVX512VL-NEXT:    movswl %dx, %edx
1914; AVX512VL-NEXT:    vmovd %edx, %xmm0
1915; AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm0
1916; AVX512VL-NEXT:    vmovd %esi, %xmm1
1917; AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
1918; AVX512VL-NEXT:    movswl %cx, %ecx
1919; AVX512VL-NEXT:    vmovd %ecx, %xmm2
1920; AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
1921; AVX512VL-NEXT:    cwtl
1922; AVX512VL-NEXT:    vmovd %eax, %xmm3
1923; AVX512VL-NEXT:    vcvtph2ps %xmm3, %xmm3
1924; AVX512VL-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1925; AVX512VL-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1926; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1927; AVX512VL-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1928; AVX512VL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1929; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1930; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1931; AVX512VL-NEXT:    retq
1932  %1 = load <8 x i16>, <8 x i16>* %a0
1933  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1934  %3 = bitcast <4 x i16> %2 to <4 x half>
1935  %4 = fpext <4 x half> %3 to <4 x double>
1936  ret <4 x double> %4
1937}
1938
1939define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
1940; AVX1-LABEL: load_cvt_8i16_to_8f64:
1941; AVX1:       # %bb.0:
1942; AVX1-NEXT:    movswl 8(%rdi), %eax
1943; AVX1-NEXT:    vmovd %eax, %xmm0
1944; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
1945; AVX1-NEXT:    movswl 10(%rdi), %eax
1946; AVX1-NEXT:    vmovd %eax, %xmm0
1947; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
1948; AVX1-NEXT:    movswl 12(%rdi), %eax
1949; AVX1-NEXT:    vmovd %eax, %xmm0
1950; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
1951; AVX1-NEXT:    movswl 14(%rdi), %eax
1952; AVX1-NEXT:    vmovd %eax, %xmm0
1953; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm4
1954; AVX1-NEXT:    movswl (%rdi), %eax
1955; AVX1-NEXT:    vmovd %eax, %xmm0
1956; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
1957; AVX1-NEXT:    movswl 2(%rdi), %eax
1958; AVX1-NEXT:    vmovd %eax, %xmm5
1959; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
1960; AVX1-NEXT:    movswl 4(%rdi), %eax
1961; AVX1-NEXT:    vmovd %eax, %xmm6
1962; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
1963; AVX1-NEXT:    movswl 6(%rdi), %eax
1964; AVX1-NEXT:    vmovd %eax, %xmm7
1965; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
1966; AVX1-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
1967; AVX1-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
1968; AVX1-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1969; AVX1-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
1970; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
1971; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1972; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
1973; AVX1-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
1974; AVX1-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
1975; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1976; AVX1-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
1977; AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
1978; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1979; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1980; AVX1-NEXT:    retq
1981;
1982; AVX2-LABEL: load_cvt_8i16_to_8f64:
1983; AVX2:       # %bb.0:
1984; AVX2-NEXT:    movswl 8(%rdi), %eax
1985; AVX2-NEXT:    vmovd %eax, %xmm0
1986; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
1987; AVX2-NEXT:    movswl 10(%rdi), %eax
1988; AVX2-NEXT:    vmovd %eax, %xmm0
1989; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
1990; AVX2-NEXT:    movswl 12(%rdi), %eax
1991; AVX2-NEXT:    vmovd %eax, %xmm0
1992; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
1993; AVX2-NEXT:    movswl 14(%rdi), %eax
1994; AVX2-NEXT:    vmovd %eax, %xmm0
1995; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm4
1996; AVX2-NEXT:    movswl (%rdi), %eax
1997; AVX2-NEXT:    vmovd %eax, %xmm0
1998; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
1999; AVX2-NEXT:    movswl 2(%rdi), %eax
2000; AVX2-NEXT:    vmovd %eax, %xmm5
2001; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
2002; AVX2-NEXT:    movswl 4(%rdi), %eax
2003; AVX2-NEXT:    vmovd %eax, %xmm6
2004; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
2005; AVX2-NEXT:    movswl 6(%rdi), %eax
2006; AVX2-NEXT:    vmovd %eax, %xmm7
2007; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
2008; AVX2-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2009; AVX2-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2010; AVX2-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2011; AVX2-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2012; AVX2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2013; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
2014; AVX2-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
2015; AVX2-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2016; AVX2-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2017; AVX2-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2018; AVX2-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2019; AVX2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2020; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2021; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
2022; AVX2-NEXT:    retq
2023;
2024; AVX512-LABEL: load_cvt_8i16_to_8f64:
2025; AVX512:       # %bb.0:
2026; AVX512-NEXT:    movswl (%rdi), %eax
2027; AVX512-NEXT:    vmovd %eax, %xmm0
2028; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
2029; AVX512-NEXT:    movswl 2(%rdi), %eax
2030; AVX512-NEXT:    vmovd %eax, %xmm1
2031; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
2032; AVX512-NEXT:    movswl 4(%rdi), %eax
2033; AVX512-NEXT:    vmovd %eax, %xmm2
2034; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
2035; AVX512-NEXT:    movswl 6(%rdi), %eax
2036; AVX512-NEXT:    vmovd %eax, %xmm3
2037; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
2038; AVX512-NEXT:    movswl 8(%rdi), %eax
2039; AVX512-NEXT:    vmovd %eax, %xmm4
2040; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
2041; AVX512-NEXT:    movswl 10(%rdi), %eax
2042; AVX512-NEXT:    vmovd %eax, %xmm5
2043; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
2044; AVX512-NEXT:    movswl 12(%rdi), %eax
2045; AVX512-NEXT:    vmovd %eax, %xmm6
2046; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
2047; AVX512-NEXT:    movswl 14(%rdi), %eax
2048; AVX512-NEXT:    vmovd %eax, %xmm7
2049; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
2050; AVX512-NEXT:    vcvtss2sd %xmm7, %xmm7, %xmm7
2051; AVX512-NEXT:    vcvtss2sd %xmm6, %xmm6, %xmm6
2052; AVX512-NEXT:    vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2053; AVX512-NEXT:    vcvtss2sd %xmm5, %xmm5, %xmm5
2054; AVX512-NEXT:    vcvtss2sd %xmm4, %xmm4, %xmm4
2055; AVX512-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
2056; AVX512-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
2057; AVX512-NEXT:    vcvtss2sd %xmm3, %xmm3, %xmm3
2058; AVX512-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
2059; AVX512-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2060; AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
2061; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
2062; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2063; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2064; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2065; AVX512-NEXT:    retq
2066  %1 = load <8 x i16>, <8 x i16>* %a0
2067  %2 = bitcast <8 x i16> %1 to <8 x half>
2068  %3 = fpext <8 x half> %2 to <8 x double>
2069  ret <8 x double> %3
2070}
2071
2072;
2073; Float to Half
2074;
2075
2076define i16 @cvt_f32_to_i16(float %a0) nounwind {
2077; ALL-LABEL: cvt_f32_to_i16:
2078; ALL:       # %bb.0:
2079; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2080; ALL-NEXT:    vmovd %xmm0, %eax
2081; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
2082; ALL-NEXT:    retq
2083  %1 = fptrunc float %a0 to half
2084  %2 = bitcast half %1 to i16
2085  ret i16 %2
2086}
2087
2088define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
2089; ALL-LABEL: cvt_4f32_to_4i16:
2090; ALL:       # %bb.0:
2091; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2092; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2093; ALL-NEXT:    vmovd %xmm1, %eax
2094; ALL-NEXT:    shll $16, %eax
2095; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2096; ALL-NEXT:    vmovd %xmm1, %ecx
2097; ALL-NEXT:    movzwl %cx, %ecx
2098; ALL-NEXT:    orl %eax, %ecx
2099; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2100; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2101; ALL-NEXT:    vmovd %xmm1, %eax
2102; ALL-NEXT:    shll $16, %eax
2103; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2104; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2105; ALL-NEXT:    vmovd %xmm0, %edx
2106; ALL-NEXT:    movzwl %dx, %edx
2107; ALL-NEXT:    orl %eax, %edx
2108; ALL-NEXT:    shlq $32, %rdx
2109; ALL-NEXT:    orq %rcx, %rdx
2110; ALL-NEXT:    vmovq %rdx, %xmm0
2111; ALL-NEXT:    retq
2112  %1 = fptrunc <4 x float> %a0 to <4 x half>
2113  %2 = bitcast <4 x half> %1 to <4 x i16>
2114  ret <4 x i16> %2
2115}
2116
2117define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
2118; ALL-LABEL: cvt_4f32_to_8i16_undef:
2119; ALL:       # %bb.0:
2120; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2121; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2122; ALL-NEXT:    vmovd %xmm1, %eax
2123; ALL-NEXT:    shll $16, %eax
2124; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2125; ALL-NEXT:    vmovd %xmm1, %ecx
2126; ALL-NEXT:    movzwl %cx, %ecx
2127; ALL-NEXT:    orl %eax, %ecx
2128; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2129; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2130; ALL-NEXT:    vmovd %xmm1, %eax
2131; ALL-NEXT:    shll $16, %eax
2132; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2133; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2134; ALL-NEXT:    vmovd %xmm0, %edx
2135; ALL-NEXT:    movzwl %dx, %edx
2136; ALL-NEXT:    orl %eax, %edx
2137; ALL-NEXT:    shlq $32, %rdx
2138; ALL-NEXT:    orq %rcx, %rdx
2139; ALL-NEXT:    vmovq %rdx, %xmm0
2140; ALL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2141; ALL-NEXT:    retq
2142  %1 = fptrunc <4 x float> %a0 to <4 x half>
2143  %2 = bitcast <4 x half> %1 to <4 x i16>
2144  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2145  ret <8 x i16> %3
2146}
2147
2148define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
2149; AVX1-LABEL: cvt_4f32_to_8i16_zero:
2150; AVX1:       # %bb.0:
2151; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2152; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2153; AVX1-NEXT:    vmovd %xmm1, %eax
2154; AVX1-NEXT:    shll $16, %eax
2155; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2156; AVX1-NEXT:    vmovd %xmm1, %ecx
2157; AVX1-NEXT:    movzwl %cx, %ecx
2158; AVX1-NEXT:    orl %eax, %ecx
2159; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2160; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2161; AVX1-NEXT:    vmovd %xmm1, %eax
2162; AVX1-NEXT:    shll $16, %eax
2163; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2164; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2165; AVX1-NEXT:    vmovd %xmm0, %edx
2166; AVX1-NEXT:    movzwl %dx, %edx
2167; AVX1-NEXT:    orl %eax, %edx
2168; AVX1-NEXT:    shlq $32, %rdx
2169; AVX1-NEXT:    orq %rcx, %rdx
2170; AVX1-NEXT:    vmovq %rdx, %xmm0
2171; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2172; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2173; AVX1-NEXT:    retq
2174;
2175; AVX2-SLOW-LABEL: cvt_4f32_to_8i16_zero:
2176; AVX2-SLOW:       # %bb.0:
2177; AVX2-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2178; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2179; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
2180; AVX2-SLOW-NEXT:    shll $16, %eax
2181; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2182; AVX2-SLOW-NEXT:    vmovd %xmm1, %ecx
2183; AVX2-SLOW-NEXT:    movzwl %cx, %ecx
2184; AVX2-SLOW-NEXT:    orl %eax, %ecx
2185; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2186; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2187; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
2188; AVX2-SLOW-NEXT:    shll $16, %eax
2189; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2190; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2191; AVX2-SLOW-NEXT:    vmovd %xmm0, %edx
2192; AVX2-SLOW-NEXT:    movzwl %dx, %edx
2193; AVX2-SLOW-NEXT:    orl %eax, %edx
2194; AVX2-SLOW-NEXT:    shlq $32, %rdx
2195; AVX2-SLOW-NEXT:    orq %rcx, %rdx
2196; AVX2-SLOW-NEXT:    vmovq %rdx, %xmm0
2197; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2198; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2199; AVX2-SLOW-NEXT:    retq
2200;
2201; AVX2-FAST-LABEL: cvt_4f32_to_8i16_zero:
2202; AVX2-FAST:       # %bb.0:
2203; AVX2-FAST-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2204; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2205; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
2206; AVX2-FAST-NEXT:    shll $16, %eax
2207; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2208; AVX2-FAST-NEXT:    vmovd %xmm1, %ecx
2209; AVX2-FAST-NEXT:    movzwl %cx, %ecx
2210; AVX2-FAST-NEXT:    orl %eax, %ecx
2211; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2212; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2213; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
2214; AVX2-FAST-NEXT:    shll $16, %eax
2215; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2216; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2217; AVX2-FAST-NEXT:    vmovd %xmm0, %edx
2218; AVX2-FAST-NEXT:    movzwl %dx, %edx
2219; AVX2-FAST-NEXT:    orl %eax, %edx
2220; AVX2-FAST-NEXT:    shlq $32, %rdx
2221; AVX2-FAST-NEXT:    orq %rcx, %rdx
2222; AVX2-FAST-NEXT:    vmovq %rdx, %xmm0
2223; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2224; AVX2-FAST-NEXT:    retq
2225;
2226; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
2227; AVX512F:       # %bb.0:
2228; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2229; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2230; AVX512F-NEXT:    vmovd %xmm1, %eax
2231; AVX512F-NEXT:    shll $16, %eax
2232; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2233; AVX512F-NEXT:    vmovd %xmm1, %ecx
2234; AVX512F-NEXT:    movzwl %cx, %ecx
2235; AVX512F-NEXT:    orl %eax, %ecx
2236; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2237; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2238; AVX512F-NEXT:    vmovd %xmm1, %eax
2239; AVX512F-NEXT:    shll $16, %eax
2240; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2241; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2242; AVX512F-NEXT:    vmovd %xmm0, %edx
2243; AVX512F-NEXT:    movzwl %dx, %edx
2244; AVX512F-NEXT:    orl %eax, %edx
2245; AVX512F-NEXT:    shlq $32, %rdx
2246; AVX512F-NEXT:    orq %rcx, %rdx
2247; AVX512F-NEXT:    vmovq %rdx, %xmm0
2248; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2249; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2250; AVX512F-NEXT:    retq
2251;
2252; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
2253; AVX512VL:       # %bb.0:
2254; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2255; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2256; AVX512VL-NEXT:    vmovd %xmm1, %eax
2257; AVX512VL-NEXT:    shll $16, %eax
2258; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2259; AVX512VL-NEXT:    vmovd %xmm1, %ecx
2260; AVX512VL-NEXT:    movzwl %cx, %ecx
2261; AVX512VL-NEXT:    orl %eax, %ecx
2262; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2263; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2264; AVX512VL-NEXT:    vmovd %xmm1, %eax
2265; AVX512VL-NEXT:    shll $16, %eax
2266; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2267; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2268; AVX512VL-NEXT:    vmovd %xmm0, %edx
2269; AVX512VL-NEXT:    movzwl %dx, %edx
2270; AVX512VL-NEXT:    orl %eax, %edx
2271; AVX512VL-NEXT:    shlq $32, %rdx
2272; AVX512VL-NEXT:    orq %rcx, %rdx
2273; AVX512VL-NEXT:    vmovq %rdx, %xmm0
2274; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2275; AVX512VL-NEXT:    retq
2276  %1 = fptrunc <4 x float> %a0 to <4 x half>
2277  %2 = bitcast <4 x half> %1 to <4 x i16>
2278  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2279  ret <8 x i16> %3
2280}
2281
2282define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
2283; ALL-LABEL: cvt_8f32_to_8i16:
2284; ALL:       # %bb.0:
2285; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2286; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2287; ALL-NEXT:    vmovd %xmm1, %eax
2288; ALL-NEXT:    shll $16, %eax
2289; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2290; ALL-NEXT:    vmovd %xmm1, %ecx
2291; ALL-NEXT:    movzwl %cx, %ecx
2292; ALL-NEXT:    orl %eax, %ecx
2293; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2294; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2295; ALL-NEXT:    vmovd %xmm1, %edx
2296; ALL-NEXT:    shll $16, %edx
2297; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2298; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2299; ALL-NEXT:    vmovd %xmm1, %eax
2300; ALL-NEXT:    movzwl %ax, %eax
2301; ALL-NEXT:    orl %edx, %eax
2302; ALL-NEXT:    shlq $32, %rax
2303; ALL-NEXT:    orq %rcx, %rax
2304; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
2305; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2306; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2307; ALL-NEXT:    vmovd %xmm1, %ecx
2308; ALL-NEXT:    shll $16, %ecx
2309; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2310; ALL-NEXT:    vmovd %xmm1, %edx
2311; ALL-NEXT:    movzwl %dx, %edx
2312; ALL-NEXT:    orl %ecx, %edx
2313; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2314; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2315; ALL-NEXT:    vmovd %xmm1, %ecx
2316; ALL-NEXT:    shll $16, %ecx
2317; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2318; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2319; ALL-NEXT:    vmovd %xmm0, %esi
2320; ALL-NEXT:    movzwl %si, %esi
2321; ALL-NEXT:    orl %ecx, %esi
2322; ALL-NEXT:    shlq $32, %rsi
2323; ALL-NEXT:    orq %rdx, %rsi
2324; ALL-NEXT:    vmovq %rsi, %xmm0
2325; ALL-NEXT:    vmovq %rax, %xmm1
2326; ALL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2327; ALL-NEXT:    vzeroupper
2328; ALL-NEXT:    retq
2329  %1 = fptrunc <8 x float> %a0 to <8 x half>
2330  %2 = bitcast <8 x half> %1 to <8 x i16>
2331  ret <8 x i16> %2
2332}
2333
2334define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
2335; AVX1-LABEL: cvt_16f32_to_16i16:
2336; AVX1:       # %bb.0:
2337; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
2338; AVX1-NEXT:    vmovd %xmm2, %eax
2339; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2340; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2341; AVX1-NEXT:    vmovd %eax, %xmm3
2342; AVX1-NEXT:    vmovd %xmm2, %eax
2343; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2344; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2345; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
2346; AVX1-NEXT:    vmovd %xmm2, %eax
2347; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2348; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
2349; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2350; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
2351; AVX1-NEXT:    vmovd %xmm1, %eax
2352; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
2353; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
2354; AVX1-NEXT:    vmovd %xmm1, %eax
2355; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2356; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2357; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
2358; AVX1-NEXT:    vmovd %xmm1, %eax
2359; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
2360; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2361; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
2362; AVX1-NEXT:    vmovd %xmm1, %eax
2363; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2364; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
2365; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2366; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
2367; AVX1-NEXT:    vmovd %xmm2, %eax
2368; AVX1-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
2369; AVX1-NEXT:    vmovd %xmm1, %eax
2370; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2371; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2372; AVX1-NEXT:    vmovd %eax, %xmm3
2373; AVX1-NEXT:    vmovd %xmm1, %eax
2374; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2375; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2376; AVX1-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
2377; AVX1-NEXT:    vmovd %xmm1, %eax
2378; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2379; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2380; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2381; AVX1-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
2382; AVX1-NEXT:    vmovd %xmm0, %eax
2383; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
2384; AVX1-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
2385; AVX1-NEXT:    vmovd %xmm0, %eax
2386; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2387; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2388; AVX1-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
2389; AVX1-NEXT:    vmovd %xmm0, %eax
2390; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
2391; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2392; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
2393; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2394; AVX1-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
2395; AVX1-NEXT:    vmovd %xmm1, %eax
2396; AVX1-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
2397; AVX1-NEXT:    vmovd %xmm0, %eax
2398; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
2399; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2400; AVX1-NEXT:    retq
2401;
2402; AVX2-LABEL: cvt_16f32_to_16i16:
2403; AVX2:       # %bb.0:
2404; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
2405; AVX2-NEXT:    vmovd %xmm2, %eax
2406; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2407; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2408; AVX2-NEXT:    vmovd %eax, %xmm3
2409; AVX2-NEXT:    vmovd %xmm2, %eax
2410; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2411; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2412; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
2413; AVX2-NEXT:    vmovd %xmm2, %eax
2414; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
2415; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
2416; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2417; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
2418; AVX2-NEXT:    vmovd %xmm1, %eax
2419; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
2420; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
2421; AVX2-NEXT:    vmovd %xmm1, %eax
2422; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2423; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2424; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
2425; AVX2-NEXT:    vmovd %xmm1, %eax
2426; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
2427; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2428; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
2429; AVX2-NEXT:    vmovd %xmm1, %eax
2430; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2431; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
2432; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2433; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
2434; AVX2-NEXT:    vmovd %xmm2, %eax
2435; AVX2-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
2436; AVX2-NEXT:    vmovd %xmm1, %eax
2437; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2438; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2439; AVX2-NEXT:    vmovd %eax, %xmm3
2440; AVX2-NEXT:    vmovd %xmm1, %eax
2441; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2442; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2443; AVX2-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
2444; AVX2-NEXT:    vmovd %xmm1, %eax
2445; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
2446; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2447; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2448; AVX2-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
2449; AVX2-NEXT:    vmovd %xmm0, %eax
2450; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
2451; AVX2-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
2452; AVX2-NEXT:    vmovd %xmm0, %eax
2453; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2454; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2455; AVX2-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
2456; AVX2-NEXT:    vmovd %xmm0, %eax
2457; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
2458; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2459; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
2460; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2461; AVX2-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
2462; AVX2-NEXT:    vmovd %xmm1, %eax
2463; AVX2-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
2464; AVX2-NEXT:    vmovd %xmm0, %eax
2465; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
2466; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2467; AVX2-NEXT:    retq
2468;
2469; AVX512-LABEL: cvt_16f32_to_16i16:
2470; AVX512:       # %bb.0:
2471; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
2472; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm2
2473; AVX512-NEXT:    vmovd %xmm2, %eax
2474; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2475; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2476; AVX512-NEXT:    vmovd %eax, %xmm3
2477; AVX512-NEXT:    vmovd %xmm2, %eax
2478; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2479; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2480; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
2481; AVX512-NEXT:    vmovd %xmm2, %eax
2482; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
2483; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
2484; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2485; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
2486; AVX512-NEXT:    vmovd %xmm1, %eax
2487; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
2488; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
2489; AVX512-NEXT:    vmovd %xmm1, %eax
2490; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2491; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2492; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
2493; AVX512-NEXT:    vmovd %xmm1, %eax
2494; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
2495; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2496; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
2497; AVX512-NEXT:    vmovd %xmm1, %eax
2498; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2499; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
2500; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2501; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
2502; AVX512-NEXT:    vmovd %xmm2, %eax
2503; AVX512-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm2
2504; AVX512-NEXT:    vmovd %xmm1, %eax
2505; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2506; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2507; AVX512-NEXT:    vmovd %eax, %xmm3
2508; AVX512-NEXT:    vmovd %xmm1, %eax
2509; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2510; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2511; AVX512-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
2512; AVX512-NEXT:    vmovd %xmm1, %eax
2513; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
2514; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2515; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2516; AVX512-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
2517; AVX512-NEXT:    vmovd %xmm0, %eax
2518; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
2519; AVX512-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
2520; AVX512-NEXT:    vmovd %xmm0, %eax
2521; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2522; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2523; AVX512-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
2524; AVX512-NEXT:    vmovd %xmm0, %eax
2525; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
2526; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2527; AVX512-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
2528; AVX512-NEXT:    vmovd %xmm0, %eax
2529; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
2530; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2531; AVX512-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm1
2532; AVX512-NEXT:    vmovd %xmm0, %eax
2533; AVX512-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
2534; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2535; AVX512-NEXT:    retq
2536  %1 = fptrunc <16 x float> %a0 to <16 x half>
2537  %2 = bitcast <16 x half> %1 to <16 x i16>
2538  ret <16 x i16> %2
2539}
2540
2541;
2542; Float to Half (Store)
2543;
2544
2545define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
2546; ALL-LABEL: store_cvt_f32_to_i16:
2547; ALL:       # %bb.0:
2548; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2549; ALL-NEXT:    vmovd %xmm0, %eax
2550; ALL-NEXT:    movw %ax, (%rdi)
2551; ALL-NEXT:    retq
2552  %1 = fptrunc float %a0 to half
2553  %2 = bitcast half %1 to i16
2554  store i16 %2, i16* %a1
2555  ret void
2556}
2557
2558define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
2559; ALL-LABEL: store_cvt_4f32_to_4i16:
2560; ALL:       # %bb.0:
2561; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2562; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2563; ALL-NEXT:    vmovd %xmm1, %eax
2564; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2565; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2566; ALL-NEXT:    vmovd %xmm1, %ecx
2567; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2568; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2569; ALL-NEXT:    vmovd %xmm1, %edx
2570; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2571; ALL-NEXT:    vmovd %xmm0, %esi
2572; ALL-NEXT:    movw %si, (%rdi)
2573; ALL-NEXT:    movw %dx, 6(%rdi)
2574; ALL-NEXT:    movw %cx, 4(%rdi)
2575; ALL-NEXT:    movw %ax, 2(%rdi)
2576; ALL-NEXT:    retq
2577  %1 = fptrunc <4 x float> %a0 to <4 x half>
2578  %2 = bitcast <4 x half> %1 to <4 x i16>
2579  store <4 x i16> %2, <4 x i16>* %a1
2580  ret void
2581}
2582
2583define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
2584; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
2585; ALL:       # %bb.0:
2586; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2587; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2588; ALL-NEXT:    vmovd %xmm1, %eax
2589; ALL-NEXT:    shll $16, %eax
2590; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2591; ALL-NEXT:    vmovd %xmm1, %ecx
2592; ALL-NEXT:    movzwl %cx, %ecx
2593; ALL-NEXT:    orl %eax, %ecx
2594; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2595; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2596; ALL-NEXT:    vmovd %xmm1, %eax
2597; ALL-NEXT:    shll $16, %eax
2598; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2599; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2600; ALL-NEXT:    vmovd %xmm0, %edx
2601; ALL-NEXT:    movzwl %dx, %edx
2602; ALL-NEXT:    orl %eax, %edx
2603; ALL-NEXT:    shlq $32, %rdx
2604; ALL-NEXT:    orq %rcx, %rdx
2605; ALL-NEXT:    vmovq %rdx, %xmm0
2606; ALL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2607; ALL-NEXT:    vmovdqa %xmm0, (%rdi)
2608; ALL-NEXT:    retq
2609  %1 = fptrunc <4 x float> %a0 to <4 x half>
2610  %2 = bitcast <4 x half> %1 to <4 x i16>
2611  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2612  store <8 x i16> %3, <8 x i16>* %a1
2613  ret void
2614}
2615
2616define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
2617; AVX1-LABEL: store_cvt_4f32_to_8i16_zero:
2618; AVX1:       # %bb.0:
2619; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2620; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2621; AVX1-NEXT:    vmovd %xmm1, %eax
2622; AVX1-NEXT:    shll $16, %eax
2623; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2624; AVX1-NEXT:    vmovd %xmm1, %ecx
2625; AVX1-NEXT:    movzwl %cx, %ecx
2626; AVX1-NEXT:    orl %eax, %ecx
2627; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2628; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2629; AVX1-NEXT:    vmovd %xmm1, %eax
2630; AVX1-NEXT:    shll $16, %eax
2631; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2632; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2633; AVX1-NEXT:    vmovd %xmm0, %edx
2634; AVX1-NEXT:    movzwl %dx, %edx
2635; AVX1-NEXT:    orl %eax, %edx
2636; AVX1-NEXT:    shlq $32, %rdx
2637; AVX1-NEXT:    orq %rcx, %rdx
2638; AVX1-NEXT:    vmovq %rdx, %xmm0
2639; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2640; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2641; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
2642; AVX1-NEXT:    retq
2643;
2644; AVX2-SLOW-LABEL: store_cvt_4f32_to_8i16_zero:
2645; AVX2-SLOW:       # %bb.0:
2646; AVX2-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2647; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2648; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
2649; AVX2-SLOW-NEXT:    shll $16, %eax
2650; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2651; AVX2-SLOW-NEXT:    vmovd %xmm1, %ecx
2652; AVX2-SLOW-NEXT:    movzwl %cx, %ecx
2653; AVX2-SLOW-NEXT:    orl %eax, %ecx
2654; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2655; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2656; AVX2-SLOW-NEXT:    vmovd %xmm1, %eax
2657; AVX2-SLOW-NEXT:    shll $16, %eax
2658; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2659; AVX2-SLOW-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2660; AVX2-SLOW-NEXT:    vmovd %xmm0, %edx
2661; AVX2-SLOW-NEXT:    movzwl %dx, %edx
2662; AVX2-SLOW-NEXT:    orl %eax, %edx
2663; AVX2-SLOW-NEXT:    shlq $32, %rdx
2664; AVX2-SLOW-NEXT:    orq %rcx, %rdx
2665; AVX2-SLOW-NEXT:    vmovq %rdx, %xmm0
2666; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2667; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2668; AVX2-SLOW-NEXT:    vmovdqa %xmm0, (%rdi)
2669; AVX2-SLOW-NEXT:    retq
2670;
2671; AVX2-FAST-LABEL: store_cvt_4f32_to_8i16_zero:
2672; AVX2-FAST:       # %bb.0:
2673; AVX2-FAST-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2674; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2675; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
2676; AVX2-FAST-NEXT:    shll $16, %eax
2677; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2678; AVX2-FAST-NEXT:    vmovd %xmm1, %ecx
2679; AVX2-FAST-NEXT:    movzwl %cx, %ecx
2680; AVX2-FAST-NEXT:    orl %eax, %ecx
2681; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2682; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2683; AVX2-FAST-NEXT:    vmovd %xmm1, %eax
2684; AVX2-FAST-NEXT:    shll $16, %eax
2685; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2686; AVX2-FAST-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2687; AVX2-FAST-NEXT:    vmovd %xmm0, %edx
2688; AVX2-FAST-NEXT:    movzwl %dx, %edx
2689; AVX2-FAST-NEXT:    orl %eax, %edx
2690; AVX2-FAST-NEXT:    shlq $32, %rdx
2691; AVX2-FAST-NEXT:    orq %rcx, %rdx
2692; AVX2-FAST-NEXT:    vmovq %rdx, %xmm0
2693; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2694; AVX2-FAST-NEXT:    vmovdqa %xmm0, (%rdi)
2695; AVX2-FAST-NEXT:    retq
2696;
2697; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
2698; AVX512F:       # %bb.0:
2699; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2700; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2701; AVX512F-NEXT:    vmovd %xmm1, %eax
2702; AVX512F-NEXT:    shll $16, %eax
2703; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2704; AVX512F-NEXT:    vmovd %xmm1, %ecx
2705; AVX512F-NEXT:    movzwl %cx, %ecx
2706; AVX512F-NEXT:    orl %eax, %ecx
2707; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2708; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2709; AVX512F-NEXT:    vmovd %xmm1, %eax
2710; AVX512F-NEXT:    shll $16, %eax
2711; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2712; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2713; AVX512F-NEXT:    vmovd %xmm0, %edx
2714; AVX512F-NEXT:    movzwl %dx, %edx
2715; AVX512F-NEXT:    orl %eax, %edx
2716; AVX512F-NEXT:    shlq $32, %rdx
2717; AVX512F-NEXT:    orq %rcx, %rdx
2718; AVX512F-NEXT:    vmovq %rdx, %xmm0
2719; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2720; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
2721; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
2722; AVX512F-NEXT:    retq
2723;
2724; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
2725; AVX512VL:       # %bb.0:
2726; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2727; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2728; AVX512VL-NEXT:    vmovd %xmm1, %eax
2729; AVX512VL-NEXT:    shll $16, %eax
2730; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
2731; AVX512VL-NEXT:    vmovd %xmm1, %ecx
2732; AVX512VL-NEXT:    movzwl %cx, %ecx
2733; AVX512VL-NEXT:    orl %eax, %ecx
2734; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2735; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2736; AVX512VL-NEXT:    vmovd %xmm1, %eax
2737; AVX512VL-NEXT:    shll $16, %eax
2738; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2739; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2740; AVX512VL-NEXT:    vmovd %xmm0, %edx
2741; AVX512VL-NEXT:    movzwl %dx, %edx
2742; AVX512VL-NEXT:    orl %eax, %edx
2743; AVX512VL-NEXT:    shlq $32, %rdx
2744; AVX512VL-NEXT:    orq %rcx, %rdx
2745; AVX512VL-NEXT:    vmovq %rdx, %xmm0
2746; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2747; AVX512VL-NEXT:    vmovdqa %xmm0, (%rdi)
2748; AVX512VL-NEXT:    retq
2749  %1 = fptrunc <4 x float> %a0 to <4 x half>
2750  %2 = bitcast <4 x half> %1 to <4 x i16>
2751  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2752  store <8 x i16> %3, <8 x i16>* %a1
2753  ret void
2754}
2755
2756define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
2757; ALL-LABEL: store_cvt_8f32_to_8i16:
2758; ALL:       # %bb.0:
2759; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2760; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2761; ALL-NEXT:    vmovd %xmm1, %r8d
2762; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2763; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2764; ALL-NEXT:    vmovd %xmm1, %r9d
2765; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2766; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2767; ALL-NEXT:    vmovd %xmm1, %r10d
2768; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
2769; ALL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2770; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2771; ALL-NEXT:    vmovd %xmm2, %r11d
2772; ALL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2773; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2774; ALL-NEXT:    vmovd %xmm2, %eax
2775; ALL-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2776; ALL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2777; ALL-NEXT:    vmovd %xmm2, %ecx
2778; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2779; ALL-NEXT:    vmovd %xmm0, %edx
2780; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
2781; ALL-NEXT:    vmovd %xmm0, %esi
2782; ALL-NEXT:    movw %si, 8(%rdi)
2783; ALL-NEXT:    movw %dx, (%rdi)
2784; ALL-NEXT:    movw %cx, 14(%rdi)
2785; ALL-NEXT:    movw %ax, 12(%rdi)
2786; ALL-NEXT:    movw %r11w, 10(%rdi)
2787; ALL-NEXT:    movw %r10w, 6(%rdi)
2788; ALL-NEXT:    movw %r9w, 4(%rdi)
2789; ALL-NEXT:    movw %r8w, 2(%rdi)
2790; ALL-NEXT:    vzeroupper
2791; ALL-NEXT:    retq
2792  %1 = fptrunc <8 x float> %a0 to <8 x half>
2793  %2 = bitcast <8 x half> %1 to <8 x i16>
2794  store <8 x i16> %2, <8 x i16>* %a1
2795  ret void
2796}
2797
2798define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind {
2799; AVX1-LABEL: store_cvt_16f32_to_16i16:
2800; AVX1:       # %bb.0:
2801; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2802; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2803; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
2804; AVX1-NEXT:    vmovd %xmm4, %eax
2805; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
2806; AVX1-NEXT:    movw %ax, 24(%rdi)
2807; AVX1-NEXT:    vmovd %xmm4, %eax
2808; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
2809; AVX1-NEXT:    movw %ax, 16(%rdi)
2810; AVX1-NEXT:    vmovd %xmm4, %eax
2811; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
2812; AVX1-NEXT:    movw %ax, 8(%rdi)
2813; AVX1-NEXT:    vmovd %xmm4, %eax
2814; AVX1-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2815; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2816; AVX1-NEXT:    movw %ax, (%rdi)
2817; AVX1-NEXT:    vmovd %xmm4, %eax
2818; AVX1-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2819; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2820; AVX1-NEXT:    movw %ax, 30(%rdi)
2821; AVX1-NEXT:    vmovd %xmm4, %eax
2822; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2823; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2824; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2825; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2826; AVX1-NEXT:    movw %ax, 28(%rdi)
2827; AVX1-NEXT:    vmovd %xmm3, %eax
2828; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
2829; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2830; AVX1-NEXT:    movw %ax, 26(%rdi)
2831; AVX1-NEXT:    vmovd %xmm3, %eax
2832; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
2833; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2834; AVX1-NEXT:    movw %ax, 22(%rdi)
2835; AVX1-NEXT:    vmovd %xmm3, %eax
2836; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2837; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2838; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2839; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2840; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
2841; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2842; AVX1-NEXT:    movw %ax, 20(%rdi)
2843; AVX1-NEXT:    vmovd %xmm1, %eax
2844; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
2845; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2846; AVX1-NEXT:    movw %ax, 18(%rdi)
2847; AVX1-NEXT:    vmovd %xmm1, %eax
2848; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2849; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2850; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
2851; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2852; AVX1-NEXT:    movw %ax, 14(%rdi)
2853; AVX1-NEXT:    vmovd %xmm2, %eax
2854; AVX1-NEXT:    movw %ax, 12(%rdi)
2855; AVX1-NEXT:    vmovd %xmm1, %eax
2856; AVX1-NEXT:    movw %ax, 10(%rdi)
2857; AVX1-NEXT:    vmovd %xmm0, %eax
2858; AVX1-NEXT:    movw %ax, 6(%rdi)
2859; AVX1-NEXT:    vmovd %xmm3, %eax
2860; AVX1-NEXT:    movw %ax, 4(%rdi)
2861; AVX1-NEXT:    vmovd %xmm4, %eax
2862; AVX1-NEXT:    movw %ax, 2(%rdi)
2863; AVX1-NEXT:    vzeroupper
2864; AVX1-NEXT:    retq
2865;
2866; AVX2-LABEL: store_cvt_16f32_to_16i16:
2867; AVX2:       # %bb.0:
2868; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
2869; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
2870; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
2871; AVX2-NEXT:    vmovd %xmm4, %eax
2872; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
2873; AVX2-NEXT:    movw %ax, 24(%rdi)
2874; AVX2-NEXT:    vmovd %xmm4, %eax
2875; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
2876; AVX2-NEXT:    movw %ax, 16(%rdi)
2877; AVX2-NEXT:    vmovd %xmm4, %eax
2878; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
2879; AVX2-NEXT:    movw %ax, 8(%rdi)
2880; AVX2-NEXT:    vmovd %xmm4, %eax
2881; AVX2-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2882; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2883; AVX2-NEXT:    movw %ax, (%rdi)
2884; AVX2-NEXT:    vmovd %xmm4, %eax
2885; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2886; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2887; AVX2-NEXT:    movw %ax, 30(%rdi)
2888; AVX2-NEXT:    vmovd %xmm4, %eax
2889; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2890; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2891; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2892; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2893; AVX2-NEXT:    movw %ax, 28(%rdi)
2894; AVX2-NEXT:    vmovd %xmm3, %eax
2895; AVX2-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
2896; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2897; AVX2-NEXT:    movw %ax, 26(%rdi)
2898; AVX2-NEXT:    vmovd %xmm3, %eax
2899; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
2900; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2901; AVX2-NEXT:    movw %ax, 22(%rdi)
2902; AVX2-NEXT:    vmovd %xmm3, %eax
2903; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2904; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2905; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2906; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2907; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
2908; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2909; AVX2-NEXT:    movw %ax, 20(%rdi)
2910; AVX2-NEXT:    vmovd %xmm1, %eax
2911; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
2912; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2913; AVX2-NEXT:    movw %ax, 18(%rdi)
2914; AVX2-NEXT:    vmovd %xmm1, %eax
2915; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2916; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2917; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
2918; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2919; AVX2-NEXT:    movw %ax, 14(%rdi)
2920; AVX2-NEXT:    vmovd %xmm2, %eax
2921; AVX2-NEXT:    movw %ax, 12(%rdi)
2922; AVX2-NEXT:    vmovd %xmm1, %eax
2923; AVX2-NEXT:    movw %ax, 10(%rdi)
2924; AVX2-NEXT:    vmovd %xmm0, %eax
2925; AVX2-NEXT:    movw %ax, 6(%rdi)
2926; AVX2-NEXT:    vmovd %xmm3, %eax
2927; AVX2-NEXT:    movw %ax, 4(%rdi)
2928; AVX2-NEXT:    vmovd %xmm4, %eax
2929; AVX2-NEXT:    movw %ax, 2(%rdi)
2930; AVX2-NEXT:    vzeroupper
2931; AVX2-NEXT:    retq
2932;
2933; AVX512-LABEL: store_cvt_16f32_to_16i16:
2934; AVX512:       # %bb.0:
2935; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
2936; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
2937; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm3
2938; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
2939; AVX512-NEXT:    vmovd %xmm4, %eax
2940; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
2941; AVX512-NEXT:    movw %ax, 24(%rdi)
2942; AVX512-NEXT:    vmovd %xmm4, %eax
2943; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
2944; AVX512-NEXT:    movw %ax, 16(%rdi)
2945; AVX512-NEXT:    vmovd %xmm4, %eax
2946; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
2947; AVX512-NEXT:    movw %ax, 8(%rdi)
2948; AVX512-NEXT:    vmovd %xmm4, %eax
2949; AVX512-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2950; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2951; AVX512-NEXT:    movw %ax, (%rdi)
2952; AVX512-NEXT:    vmovd %xmm4, %eax
2953; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2954; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2955; AVX512-NEXT:    movw %ax, 30(%rdi)
2956; AVX512-NEXT:    vmovd %xmm4, %eax
2957; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2958; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2959; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2960; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2961; AVX512-NEXT:    movw %ax, 28(%rdi)
2962; AVX512-NEXT:    vmovd %xmm3, %eax
2963; AVX512-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
2964; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2965; AVX512-NEXT:    movw %ax, 26(%rdi)
2966; AVX512-NEXT:    vmovd %xmm3, %eax
2967; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
2968; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2969; AVX512-NEXT:    movw %ax, 22(%rdi)
2970; AVX512-NEXT:    vmovd %xmm3, %eax
2971; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2972; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
2973; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2974; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2975; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
2976; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2977; AVX512-NEXT:    movw %ax, 20(%rdi)
2978; AVX512-NEXT:    vmovd %xmm2, %eax
2979; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2980; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2981; AVX512-NEXT:    movw %ax, 18(%rdi)
2982; AVX512-NEXT:    vmovd %xmm2, %eax
2983; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2984; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2985; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
2986; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2987; AVX512-NEXT:    movw %ax, 14(%rdi)
2988; AVX512-NEXT:    vmovd %xmm1, %eax
2989; AVX512-NEXT:    movw %ax, 12(%rdi)
2990; AVX512-NEXT:    vmovd %xmm2, %eax
2991; AVX512-NEXT:    movw %ax, 10(%rdi)
2992; AVX512-NEXT:    vmovd %xmm0, %eax
2993; AVX512-NEXT:    movw %ax, 6(%rdi)
2994; AVX512-NEXT:    vmovd %xmm3, %eax
2995; AVX512-NEXT:    movw %ax, 4(%rdi)
2996; AVX512-NEXT:    vmovd %xmm4, %eax
2997; AVX512-NEXT:    movw %ax, 2(%rdi)
2998; AVX512-NEXT:    vzeroupper
2999; AVX512-NEXT:    retq
3000  %1 = fptrunc <16 x float> %a0 to <16 x half>
3001  %2 = bitcast <16 x half> %1 to <16 x i16>
3002  store <16 x i16> %2, <16 x i16>* %a1
3003  ret void
3004}
3005
3006;
3007; Double to Half
3008;
3009
3010define i16 @cvt_f64_to_i16(double %a0) nounwind {
3011; ALL-LABEL: cvt_f64_to_i16:
3012; ALL:       # %bb.0:
3013; ALL-NEXT:    jmp __truncdfhf2 # TAILCALL
3014  %1 = fptrunc double %a0 to half
3015  %2 = bitcast half %1 to i16
3016  ret i16 %2
3017}
3018
3019define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
3020; ALL-LABEL: cvt_2f64_to_2i16:
3021; ALL:       # %bb.0:
3022; ALL-NEXT:    pushq %rbx
3023; ALL-NEXT:    subq $16, %rsp
3024; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3025; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3026; ALL-NEXT:    callq __truncdfhf2
3027; ALL-NEXT:    movl %eax, %ebx
3028; ALL-NEXT:    shll $16, %ebx
3029; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3030; ALL-NEXT:    callq __truncdfhf2
3031; ALL-NEXT:    movzwl %ax, %eax
3032; ALL-NEXT:    orl %ebx, %eax
3033; ALL-NEXT:    vmovd %eax, %xmm0
3034; ALL-NEXT:    addq $16, %rsp
3035; ALL-NEXT:    popq %rbx
3036; ALL-NEXT:    retq
3037  %1 = fptrunc <2 x double> %a0 to <2 x half>
3038  %2 = bitcast <2 x half> %1 to <2 x i16>
3039  ret <2 x i16> %2
3040}
3041
3042define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
3043; AVX1-LABEL: cvt_4f64_to_4i16:
3044; AVX1:       # %bb.0:
3045; AVX1-NEXT:    pushq %r14
3046; AVX1-NEXT:    pushq %rbx
3047; AVX1-NEXT:    subq $40, %rsp
3048; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3049; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3050; AVX1-NEXT:    vzeroupper
3051; AVX1-NEXT:    callq __truncdfhf2
3052; AVX1-NEXT:    movl %eax, %ebx
3053; AVX1-NEXT:    shll $16, %ebx
3054; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3055; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3056; AVX1-NEXT:    vzeroupper
3057; AVX1-NEXT:    callq __truncdfhf2
3058; AVX1-NEXT:    movzwl %ax, %r14d
3059; AVX1-NEXT:    orl %ebx, %r14d
3060; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3061; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3062; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3063; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3064; AVX1-NEXT:    vzeroupper
3065; AVX1-NEXT:    callq __truncdfhf2
3066; AVX1-NEXT:    movl %eax, %ebx
3067; AVX1-NEXT:    shll $16, %ebx
3068; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3069; AVX1-NEXT:    callq __truncdfhf2
3070; AVX1-NEXT:    movzwl %ax, %eax
3071; AVX1-NEXT:    orl %ebx, %eax
3072; AVX1-NEXT:    shlq $32, %rax
3073; AVX1-NEXT:    orq %r14, %rax
3074; AVX1-NEXT:    vmovq %rax, %xmm0
3075; AVX1-NEXT:    addq $40, %rsp
3076; AVX1-NEXT:    popq %rbx
3077; AVX1-NEXT:    popq %r14
3078; AVX1-NEXT:    retq
3079;
3080; AVX2-LABEL: cvt_4f64_to_4i16:
3081; AVX2:       # %bb.0:
3082; AVX2-NEXT:    pushq %r14
3083; AVX2-NEXT:    pushq %rbx
3084; AVX2-NEXT:    subq $40, %rsp
3085; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3086; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3087; AVX2-NEXT:    vzeroupper
3088; AVX2-NEXT:    callq __truncdfhf2
3089; AVX2-NEXT:    movl %eax, %ebx
3090; AVX2-NEXT:    shll $16, %ebx
3091; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3092; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3093; AVX2-NEXT:    vzeroupper
3094; AVX2-NEXT:    callq __truncdfhf2
3095; AVX2-NEXT:    movzwl %ax, %r14d
3096; AVX2-NEXT:    orl %ebx, %r14d
3097; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3098; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3099; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3100; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3101; AVX2-NEXT:    vzeroupper
3102; AVX2-NEXT:    callq __truncdfhf2
3103; AVX2-NEXT:    movl %eax, %ebx
3104; AVX2-NEXT:    shll $16, %ebx
3105; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3106; AVX2-NEXT:    callq __truncdfhf2
3107; AVX2-NEXT:    movzwl %ax, %eax
3108; AVX2-NEXT:    orl %ebx, %eax
3109; AVX2-NEXT:    shlq $32, %rax
3110; AVX2-NEXT:    orq %r14, %rax
3111; AVX2-NEXT:    vmovq %rax, %xmm0
3112; AVX2-NEXT:    addq $40, %rsp
3113; AVX2-NEXT:    popq %rbx
3114; AVX2-NEXT:    popq %r14
3115; AVX2-NEXT:    retq
3116;
3117; AVX512-LABEL: cvt_4f64_to_4i16:
3118; AVX512:       # %bb.0:
3119; AVX512-NEXT:    pushq %r14
3120; AVX512-NEXT:    pushq %rbx
3121; AVX512-NEXT:    subq $40, %rsp
3122; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3123; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3124; AVX512-NEXT:    vzeroupper
3125; AVX512-NEXT:    callq __truncdfhf2
3126; AVX512-NEXT:    movl %eax, %ebx
3127; AVX512-NEXT:    shll $16, %ebx
3128; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3129; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3130; AVX512-NEXT:    vzeroupper
3131; AVX512-NEXT:    callq __truncdfhf2
3132; AVX512-NEXT:    movzwl %ax, %r14d
3133; AVX512-NEXT:    orl %ebx, %r14d
3134; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3135; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3136; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3137; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3138; AVX512-NEXT:    vzeroupper
3139; AVX512-NEXT:    callq __truncdfhf2
3140; AVX512-NEXT:    movl %eax, %ebx
3141; AVX512-NEXT:    shll $16, %ebx
3142; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3143; AVX512-NEXT:    callq __truncdfhf2
3144; AVX512-NEXT:    movzwl %ax, %eax
3145; AVX512-NEXT:    orl %ebx, %eax
3146; AVX512-NEXT:    shlq $32, %rax
3147; AVX512-NEXT:    orq %r14, %rax
3148; AVX512-NEXT:    vmovq %rax, %xmm0
3149; AVX512-NEXT:    addq $40, %rsp
3150; AVX512-NEXT:    popq %rbx
3151; AVX512-NEXT:    popq %r14
3152; AVX512-NEXT:    retq
3153  %1 = fptrunc <4 x double> %a0 to <4 x half>
3154  %2 = bitcast <4 x half> %1 to <4 x i16>
3155  ret <4 x i16> %2
3156}
3157
3158define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
3159; AVX1-LABEL: cvt_4f64_to_8i16_undef:
3160; AVX1:       # %bb.0:
3161; AVX1-NEXT:    pushq %r14
3162; AVX1-NEXT:    pushq %rbx
3163; AVX1-NEXT:    subq $40, %rsp
3164; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3165; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3166; AVX1-NEXT:    vzeroupper
3167; AVX1-NEXT:    callq __truncdfhf2
3168; AVX1-NEXT:    movl %eax, %ebx
3169; AVX1-NEXT:    shll $16, %ebx
3170; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3171; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3172; AVX1-NEXT:    vzeroupper
3173; AVX1-NEXT:    callq __truncdfhf2
3174; AVX1-NEXT:    movzwl %ax, %r14d
3175; AVX1-NEXT:    orl %ebx, %r14d
3176; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3177; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3178; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3179; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3180; AVX1-NEXT:    vzeroupper
3181; AVX1-NEXT:    callq __truncdfhf2
3182; AVX1-NEXT:    movl %eax, %ebx
3183; AVX1-NEXT:    shll $16, %ebx
3184; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3185; AVX1-NEXT:    callq __truncdfhf2
3186; AVX1-NEXT:    movzwl %ax, %eax
3187; AVX1-NEXT:    orl %ebx, %eax
3188; AVX1-NEXT:    shlq $32, %rax
3189; AVX1-NEXT:    orq %r14, %rax
3190; AVX1-NEXT:    vmovq %rax, %xmm0
3191; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3192; AVX1-NEXT:    addq $40, %rsp
3193; AVX1-NEXT:    popq %rbx
3194; AVX1-NEXT:    popq %r14
3195; AVX1-NEXT:    retq
3196;
3197; AVX2-LABEL: cvt_4f64_to_8i16_undef:
3198; AVX2:       # %bb.0:
3199; AVX2-NEXT:    pushq %r14
3200; AVX2-NEXT:    pushq %rbx
3201; AVX2-NEXT:    subq $40, %rsp
3202; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3203; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3204; AVX2-NEXT:    vzeroupper
3205; AVX2-NEXT:    callq __truncdfhf2
3206; AVX2-NEXT:    movl %eax, %ebx
3207; AVX2-NEXT:    shll $16, %ebx
3208; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3209; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3210; AVX2-NEXT:    vzeroupper
3211; AVX2-NEXT:    callq __truncdfhf2
3212; AVX2-NEXT:    movzwl %ax, %r14d
3213; AVX2-NEXT:    orl %ebx, %r14d
3214; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3215; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3216; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3217; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3218; AVX2-NEXT:    vzeroupper
3219; AVX2-NEXT:    callq __truncdfhf2
3220; AVX2-NEXT:    movl %eax, %ebx
3221; AVX2-NEXT:    shll $16, %ebx
3222; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3223; AVX2-NEXT:    callq __truncdfhf2
3224; AVX2-NEXT:    movzwl %ax, %eax
3225; AVX2-NEXT:    orl %ebx, %eax
3226; AVX2-NEXT:    shlq $32, %rax
3227; AVX2-NEXT:    orq %r14, %rax
3228; AVX2-NEXT:    vmovq %rax, %xmm0
3229; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3230; AVX2-NEXT:    addq $40, %rsp
3231; AVX2-NEXT:    popq %rbx
3232; AVX2-NEXT:    popq %r14
3233; AVX2-NEXT:    retq
3234;
3235; AVX512-LABEL: cvt_4f64_to_8i16_undef:
3236; AVX512:       # %bb.0:
3237; AVX512-NEXT:    pushq %r14
3238; AVX512-NEXT:    pushq %rbx
3239; AVX512-NEXT:    subq $40, %rsp
3240; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3241; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3242; AVX512-NEXT:    vzeroupper
3243; AVX512-NEXT:    callq __truncdfhf2
3244; AVX512-NEXT:    movl %eax, %ebx
3245; AVX512-NEXT:    shll $16, %ebx
3246; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3247; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3248; AVX512-NEXT:    vzeroupper
3249; AVX512-NEXT:    callq __truncdfhf2
3250; AVX512-NEXT:    movzwl %ax, %r14d
3251; AVX512-NEXT:    orl %ebx, %r14d
3252; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3253; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3254; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3255; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3256; AVX512-NEXT:    vzeroupper
3257; AVX512-NEXT:    callq __truncdfhf2
3258; AVX512-NEXT:    movl %eax, %ebx
3259; AVX512-NEXT:    shll $16, %ebx
3260; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3261; AVX512-NEXT:    callq __truncdfhf2
3262; AVX512-NEXT:    movzwl %ax, %eax
3263; AVX512-NEXT:    orl %ebx, %eax
3264; AVX512-NEXT:    shlq $32, %rax
3265; AVX512-NEXT:    orq %r14, %rax
3266; AVX512-NEXT:    vmovq %rax, %xmm0
3267; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3268; AVX512-NEXT:    addq $40, %rsp
3269; AVX512-NEXT:    popq %rbx
3270; AVX512-NEXT:    popq %r14
3271; AVX512-NEXT:    retq
3272  %1 = fptrunc <4 x double> %a0 to <4 x half>
3273  %2 = bitcast <4 x half> %1 to <4 x i16>
3274  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3275  ret <8 x i16> %3
3276}
3277
3278define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
3279; AVX1-LABEL: cvt_4f64_to_8i16_zero:
3280; AVX1:       # %bb.0:
3281; AVX1-NEXT:    pushq %r14
3282; AVX1-NEXT:    pushq %rbx
3283; AVX1-NEXT:    subq $40, %rsp
3284; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3285; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3286; AVX1-NEXT:    vzeroupper
3287; AVX1-NEXT:    callq __truncdfhf2
3288; AVX1-NEXT:    movl %eax, %ebx
3289; AVX1-NEXT:    shll $16, %ebx
3290; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3291; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3292; AVX1-NEXT:    vzeroupper
3293; AVX1-NEXT:    callq __truncdfhf2
3294; AVX1-NEXT:    movzwl %ax, %r14d
3295; AVX1-NEXT:    orl %ebx, %r14d
3296; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3297; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3298; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3299; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3300; AVX1-NEXT:    vzeroupper
3301; AVX1-NEXT:    callq __truncdfhf2
3302; AVX1-NEXT:    movl %eax, %ebx
3303; AVX1-NEXT:    shll $16, %ebx
3304; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3305; AVX1-NEXT:    callq __truncdfhf2
3306; AVX1-NEXT:    movzwl %ax, %eax
3307; AVX1-NEXT:    orl %ebx, %eax
3308; AVX1-NEXT:    shlq $32, %rax
3309; AVX1-NEXT:    orq %r14, %rax
3310; AVX1-NEXT:    vmovq %rax, %xmm0
3311; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3312; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
3313; AVX1-NEXT:    addq $40, %rsp
3314; AVX1-NEXT:    popq %rbx
3315; AVX1-NEXT:    popq %r14
3316; AVX1-NEXT:    retq
3317;
3318; AVX2-SLOW-LABEL: cvt_4f64_to_8i16_zero:
3319; AVX2-SLOW:       # %bb.0:
3320; AVX2-SLOW-NEXT:    pushq %r14
3321; AVX2-SLOW-NEXT:    pushq %rbx
3322; AVX2-SLOW-NEXT:    subq $40, %rsp
3323; AVX2-SLOW-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3324; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3325; AVX2-SLOW-NEXT:    vzeroupper
3326; AVX2-SLOW-NEXT:    callq __truncdfhf2
3327; AVX2-SLOW-NEXT:    movl %eax, %ebx
3328; AVX2-SLOW-NEXT:    shll $16, %ebx
3329; AVX2-SLOW-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3330; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3331; AVX2-SLOW-NEXT:    vzeroupper
3332; AVX2-SLOW-NEXT:    callq __truncdfhf2
3333; AVX2-SLOW-NEXT:    movzwl %ax, %r14d
3334; AVX2-SLOW-NEXT:    orl %ebx, %r14d
3335; AVX2-SLOW-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3336; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
3337; AVX2-SLOW-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3338; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3339; AVX2-SLOW-NEXT:    vzeroupper
3340; AVX2-SLOW-NEXT:    callq __truncdfhf2
3341; AVX2-SLOW-NEXT:    movl %eax, %ebx
3342; AVX2-SLOW-NEXT:    shll $16, %ebx
3343; AVX2-SLOW-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3344; AVX2-SLOW-NEXT:    callq __truncdfhf2
3345; AVX2-SLOW-NEXT:    movzwl %ax, %eax
3346; AVX2-SLOW-NEXT:    orl %ebx, %eax
3347; AVX2-SLOW-NEXT:    shlq $32, %rax
3348; AVX2-SLOW-NEXT:    orq %r14, %rax
3349; AVX2-SLOW-NEXT:    vmovq %rax, %xmm0
3350; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3351; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
3352; AVX2-SLOW-NEXT:    addq $40, %rsp
3353; AVX2-SLOW-NEXT:    popq %rbx
3354; AVX2-SLOW-NEXT:    popq %r14
3355; AVX2-SLOW-NEXT:    retq
3356;
3357; AVX2-FAST-LABEL: cvt_4f64_to_8i16_zero:
3358; AVX2-FAST:       # %bb.0:
3359; AVX2-FAST-NEXT:    pushq %r14
3360; AVX2-FAST-NEXT:    pushq %rbx
3361; AVX2-FAST-NEXT:    subq $40, %rsp
3362; AVX2-FAST-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3363; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3364; AVX2-FAST-NEXT:    vzeroupper
3365; AVX2-FAST-NEXT:    callq __truncdfhf2
3366; AVX2-FAST-NEXT:    movl %eax, %ebx
3367; AVX2-FAST-NEXT:    shll $16, %ebx
3368; AVX2-FAST-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3369; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3370; AVX2-FAST-NEXT:    vzeroupper
3371; AVX2-FAST-NEXT:    callq __truncdfhf2
3372; AVX2-FAST-NEXT:    movzwl %ax, %r14d
3373; AVX2-FAST-NEXT:    orl %ebx, %r14d
3374; AVX2-FAST-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3375; AVX2-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
3376; AVX2-FAST-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3377; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3378; AVX2-FAST-NEXT:    vzeroupper
3379; AVX2-FAST-NEXT:    callq __truncdfhf2
3380; AVX2-FAST-NEXT:    movl %eax, %ebx
3381; AVX2-FAST-NEXT:    shll $16, %ebx
3382; AVX2-FAST-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3383; AVX2-FAST-NEXT:    callq __truncdfhf2
3384; AVX2-FAST-NEXT:    movzwl %ax, %eax
3385; AVX2-FAST-NEXT:    orl %ebx, %eax
3386; AVX2-FAST-NEXT:    shlq $32, %rax
3387; AVX2-FAST-NEXT:    orq %r14, %rax
3388; AVX2-FAST-NEXT:    vmovq %rax, %xmm0
3389; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
3390; AVX2-FAST-NEXT:    addq $40, %rsp
3391; AVX2-FAST-NEXT:    popq %rbx
3392; AVX2-FAST-NEXT:    popq %r14
3393; AVX2-FAST-NEXT:    retq
3394;
3395; AVX512F-LABEL: cvt_4f64_to_8i16_zero:
3396; AVX512F:       # %bb.0:
3397; AVX512F-NEXT:    pushq %r14
3398; AVX512F-NEXT:    pushq %rbx
3399; AVX512F-NEXT:    subq $40, %rsp
3400; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3401; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3402; AVX512F-NEXT:    vzeroupper
3403; AVX512F-NEXT:    callq __truncdfhf2
3404; AVX512F-NEXT:    movl %eax, %ebx
3405; AVX512F-NEXT:    shll $16, %ebx
3406; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3407; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3408; AVX512F-NEXT:    vzeroupper
3409; AVX512F-NEXT:    callq __truncdfhf2
3410; AVX512F-NEXT:    movzwl %ax, %r14d
3411; AVX512F-NEXT:    orl %ebx, %r14d
3412; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3413; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
3414; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3415; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3416; AVX512F-NEXT:    vzeroupper
3417; AVX512F-NEXT:    callq __truncdfhf2
3418; AVX512F-NEXT:    movl %eax, %ebx
3419; AVX512F-NEXT:    shll $16, %ebx
3420; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3421; AVX512F-NEXT:    callq __truncdfhf2
3422; AVX512F-NEXT:    movzwl %ax, %eax
3423; AVX512F-NEXT:    orl %ebx, %eax
3424; AVX512F-NEXT:    shlq $32, %rax
3425; AVX512F-NEXT:    orq %r14, %rax
3426; AVX512F-NEXT:    vmovq %rax, %xmm0
3427; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3428; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
3429; AVX512F-NEXT:    addq $40, %rsp
3430; AVX512F-NEXT:    popq %rbx
3431; AVX512F-NEXT:    popq %r14
3432; AVX512F-NEXT:    retq
3433;
3434; AVX512VL-LABEL: cvt_4f64_to_8i16_zero:
3435; AVX512VL:       # %bb.0:
3436; AVX512VL-NEXT:    pushq %r14
3437; AVX512VL-NEXT:    pushq %rbx
3438; AVX512VL-NEXT:    subq $40, %rsp
3439; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3440; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3441; AVX512VL-NEXT:    vzeroupper
3442; AVX512VL-NEXT:    callq __truncdfhf2
3443; AVX512VL-NEXT:    movl %eax, %ebx
3444; AVX512VL-NEXT:    shll $16, %ebx
3445; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3446; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3447; AVX512VL-NEXT:    vzeroupper
3448; AVX512VL-NEXT:    callq __truncdfhf2
3449; AVX512VL-NEXT:    movzwl %ax, %r14d
3450; AVX512VL-NEXT:    orl %ebx, %r14d
3451; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3452; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
3453; AVX512VL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3454; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3455; AVX512VL-NEXT:    vzeroupper
3456; AVX512VL-NEXT:    callq __truncdfhf2
3457; AVX512VL-NEXT:    movl %eax, %ebx
3458; AVX512VL-NEXT:    shll $16, %ebx
3459; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3460; AVX512VL-NEXT:    callq __truncdfhf2
3461; AVX512VL-NEXT:    movzwl %ax, %eax
3462; AVX512VL-NEXT:    orl %ebx, %eax
3463; AVX512VL-NEXT:    shlq $32, %rax
3464; AVX512VL-NEXT:    orq %r14, %rax
3465; AVX512VL-NEXT:    vmovq %rax, %xmm0
3466; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
3467; AVX512VL-NEXT:    addq $40, %rsp
3468; AVX512VL-NEXT:    popq %rbx
3469; AVX512VL-NEXT:    popq %r14
3470; AVX512VL-NEXT:    retq
3471  %1 = fptrunc <4 x double> %a0 to <4 x half>
3472  %2 = bitcast <4 x half> %1 to <4 x i16>
3473  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3474  ret <8 x i16> %3
3475}
3476
3477define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
3478; AVX1-LABEL: cvt_8f64_to_8i16:
3479; AVX1:       # %bb.0:
3480; AVX1-NEXT:    pushq %r15
3481; AVX1-NEXT:    pushq %r14
3482; AVX1-NEXT:    pushq %rbx
3483; AVX1-NEXT:    subq $64, %rsp
3484; AVX1-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
3485; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3486; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3487; AVX1-NEXT:    vzeroupper
3488; AVX1-NEXT:    callq __truncdfhf2
3489; AVX1-NEXT:    movl %eax, %ebx
3490; AVX1-NEXT:    shll $16, %ebx
3491; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3492; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3493; AVX1-NEXT:    vzeroupper
3494; AVX1-NEXT:    callq __truncdfhf2
3495; AVX1-NEXT:    movzwl %ax, %r15d
3496; AVX1-NEXT:    orl %ebx, %r15d
3497; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3498; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3499; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3500; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3501; AVX1-NEXT:    vzeroupper
3502; AVX1-NEXT:    callq __truncdfhf2
3503; AVX1-NEXT:    movl %eax, %ebx
3504; AVX1-NEXT:    shll $16, %ebx
3505; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3506; AVX1-NEXT:    callq __truncdfhf2
3507; AVX1-NEXT:    movzwl %ax, %r14d
3508; AVX1-NEXT:    orl %ebx, %r14d
3509; AVX1-NEXT:    shlq $32, %r14
3510; AVX1-NEXT:    orq %r15, %r14
3511; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3512; AVX1-NEXT:    # xmm0 = mem[1,0]
3513; AVX1-NEXT:    callq __truncdfhf2
3514; AVX1-NEXT:    movl %eax, %ebx
3515; AVX1-NEXT:    shll $16, %ebx
3516; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3517; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3518; AVX1-NEXT:    vzeroupper
3519; AVX1-NEXT:    callq __truncdfhf2
3520; AVX1-NEXT:    movzwl %ax, %r15d
3521; AVX1-NEXT:    orl %ebx, %r15d
3522; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3523; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3524; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3525; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3526; AVX1-NEXT:    vzeroupper
3527; AVX1-NEXT:    callq __truncdfhf2
3528; AVX1-NEXT:    movl %eax, %ebx
3529; AVX1-NEXT:    shll $16, %ebx
3530; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3531; AVX1-NEXT:    callq __truncdfhf2
3532; AVX1-NEXT:    movzwl %ax, %eax
3533; AVX1-NEXT:    orl %ebx, %eax
3534; AVX1-NEXT:    shlq $32, %rax
3535; AVX1-NEXT:    orq %r15, %rax
3536; AVX1-NEXT:    vmovq %rax, %xmm0
3537; AVX1-NEXT:    vmovq %r14, %xmm1
3538; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3539; AVX1-NEXT:    addq $64, %rsp
3540; AVX1-NEXT:    popq %rbx
3541; AVX1-NEXT:    popq %r14
3542; AVX1-NEXT:    popq %r15
3543; AVX1-NEXT:    retq
3544;
3545; AVX2-LABEL: cvt_8f64_to_8i16:
3546; AVX2:       # %bb.0:
3547; AVX2-NEXT:    pushq %r15
3548; AVX2-NEXT:    pushq %r14
3549; AVX2-NEXT:    pushq %rbx
3550; AVX2-NEXT:    subq $64, %rsp
3551; AVX2-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
3552; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3553; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3554; AVX2-NEXT:    vzeroupper
3555; AVX2-NEXT:    callq __truncdfhf2
3556; AVX2-NEXT:    movl %eax, %ebx
3557; AVX2-NEXT:    shll $16, %ebx
3558; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3559; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3560; AVX2-NEXT:    vzeroupper
3561; AVX2-NEXT:    callq __truncdfhf2
3562; AVX2-NEXT:    movzwl %ax, %r15d
3563; AVX2-NEXT:    orl %ebx, %r15d
3564; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3565; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3566; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3567; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3568; AVX2-NEXT:    vzeroupper
3569; AVX2-NEXT:    callq __truncdfhf2
3570; AVX2-NEXT:    movl %eax, %ebx
3571; AVX2-NEXT:    shll $16, %ebx
3572; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3573; AVX2-NEXT:    callq __truncdfhf2
3574; AVX2-NEXT:    movzwl %ax, %r14d
3575; AVX2-NEXT:    orl %ebx, %r14d
3576; AVX2-NEXT:    shlq $32, %r14
3577; AVX2-NEXT:    orq %r15, %r14
3578; AVX2-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
3579; AVX2-NEXT:    # xmm0 = mem[1,0]
3580; AVX2-NEXT:    callq __truncdfhf2
3581; AVX2-NEXT:    movl %eax, %ebx
3582; AVX2-NEXT:    shll $16, %ebx
3583; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3584; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3585; AVX2-NEXT:    vzeroupper
3586; AVX2-NEXT:    callq __truncdfhf2
3587; AVX2-NEXT:    movzwl %ax, %r15d
3588; AVX2-NEXT:    orl %ebx, %r15d
3589; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3590; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3591; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3592; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3593; AVX2-NEXT:    vzeroupper
3594; AVX2-NEXT:    callq __truncdfhf2
3595; AVX2-NEXT:    movl %eax, %ebx
3596; AVX2-NEXT:    shll $16, %ebx
3597; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3598; AVX2-NEXT:    callq __truncdfhf2
3599; AVX2-NEXT:    movzwl %ax, %eax
3600; AVX2-NEXT:    orl %ebx, %eax
3601; AVX2-NEXT:    shlq $32, %rax
3602; AVX2-NEXT:    orq %r15, %rax
3603; AVX2-NEXT:    vmovq %rax, %xmm0
3604; AVX2-NEXT:    vmovq %r14, %xmm1
3605; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3606; AVX2-NEXT:    addq $64, %rsp
3607; AVX2-NEXT:    popq %rbx
3608; AVX2-NEXT:    popq %r14
3609; AVX2-NEXT:    popq %r15
3610; AVX2-NEXT:    retq
3611;
3612; AVX512-LABEL: cvt_8f64_to_8i16:
3613; AVX512:       # %bb.0:
3614; AVX512-NEXT:    pushq %r15
3615; AVX512-NEXT:    pushq %r14
3616; AVX512-NEXT:    pushq %rbx
3617; AVX512-NEXT:    subq $96, %rsp
3618; AVX512-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
3619; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3620; AVX512-NEXT:    vzeroupper
3621; AVX512-NEXT:    callq __truncdfhf2
3622; AVX512-NEXT:    movl %eax, %ebx
3623; AVX512-NEXT:    shll $16, %ebx
3624; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
3625; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3626; AVX512-NEXT:    vzeroupper
3627; AVX512-NEXT:    callq __truncdfhf2
3628; AVX512-NEXT:    movzwl %ax, %r15d
3629; AVX512-NEXT:    orl %ebx, %r15d
3630; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
3631; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3632; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3633; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3634; AVX512-NEXT:    vzeroupper
3635; AVX512-NEXT:    callq __truncdfhf2
3636; AVX512-NEXT:    movl %eax, %ebx
3637; AVX512-NEXT:    shll $16, %ebx
3638; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3639; AVX512-NEXT:    callq __truncdfhf2
3640; AVX512-NEXT:    movzwl %ax, %r14d
3641; AVX512-NEXT:    orl %ebx, %r14d
3642; AVX512-NEXT:    shlq $32, %r14
3643; AVX512-NEXT:    orq %r15, %r14
3644; AVX512-NEXT:    vmovupd (%rsp), %zmm0 # 64-byte Reload
3645; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
3646; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3647; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3648; AVX512-NEXT:    vzeroupper
3649; AVX512-NEXT:    callq __truncdfhf2
3650; AVX512-NEXT:    movl %eax, %ebx
3651; AVX512-NEXT:    shll $16, %ebx
3652; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3653; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3654; AVX512-NEXT:    vzeroupper
3655; AVX512-NEXT:    callq __truncdfhf2
3656; AVX512-NEXT:    movzwl %ax, %r15d
3657; AVX512-NEXT:    orl %ebx, %r15d
3658; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3659; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3660; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3661; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3662; AVX512-NEXT:    vzeroupper
3663; AVX512-NEXT:    callq __truncdfhf2
3664; AVX512-NEXT:    movl %eax, %ebx
3665; AVX512-NEXT:    shll $16, %ebx
3666; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3667; AVX512-NEXT:    callq __truncdfhf2
3668; AVX512-NEXT:    movzwl %ax, %eax
3669; AVX512-NEXT:    orl %ebx, %eax
3670; AVX512-NEXT:    shlq $32, %rax
3671; AVX512-NEXT:    orq %r15, %rax
3672; AVX512-NEXT:    vmovq %rax, %xmm0
3673; AVX512-NEXT:    vmovq %r14, %xmm1
3674; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3675; AVX512-NEXT:    addq $96, %rsp
3676; AVX512-NEXT:    popq %rbx
3677; AVX512-NEXT:    popq %r14
3678; AVX512-NEXT:    popq %r15
3679; AVX512-NEXT:    retq
3680  %1 = fptrunc <8 x double> %a0 to <8 x half>
3681  %2 = bitcast <8 x half> %1 to <8 x i16>
3682  ret <8 x i16> %2
3683}
3684
3685;
3686; Double to Half (Store)
3687;
3688
3689define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
3690; ALL-LABEL: store_cvt_f64_to_i16:
3691; ALL:       # %bb.0:
3692; ALL-NEXT:    pushq %rbx
3693; ALL-NEXT:    movq %rdi, %rbx
3694; ALL-NEXT:    callq __truncdfhf2
3695; ALL-NEXT:    movw %ax, (%rbx)
3696; ALL-NEXT:    popq %rbx
3697; ALL-NEXT:    retq
3698  %1 = fptrunc double %a0 to half
3699  %2 = bitcast half %1 to i16
3700  store i16 %2, i16* %a1
3701  ret void
3702}
3703
3704define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
3705; ALL-LABEL: store_cvt_2f64_to_2i16:
3706; ALL:       # %bb.0:
3707; ALL-NEXT:    pushq %rbp
3708; ALL-NEXT:    pushq %rbx
3709; ALL-NEXT:    subq $24, %rsp
3710; ALL-NEXT:    movq %rdi, %rbx
3711; ALL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3712; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3713; ALL-NEXT:    callq __truncdfhf2
3714; ALL-NEXT:    movl %eax, %ebp
3715; ALL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3716; ALL-NEXT:    callq __truncdfhf2
3717; ALL-NEXT:    movw %ax, (%rbx)
3718; ALL-NEXT:    movw %bp, 2(%rbx)
3719; ALL-NEXT:    addq $24, %rsp
3720; ALL-NEXT:    popq %rbx
3721; ALL-NEXT:    popq %rbp
3722; ALL-NEXT:    retq
3723  %1 = fptrunc <2 x double> %a0 to <2 x half>
3724  %2 = bitcast <2 x half> %1 to <2 x i16>
3725  store <2 x i16> %2, <2 x i16>* %a1
3726  ret void
3727}
3728
3729define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
3730; AVX1-LABEL: store_cvt_4f64_to_4i16:
3731; AVX1:       # %bb.0:
3732; AVX1-NEXT:    pushq %rbp
3733; AVX1-NEXT:    pushq %r15
3734; AVX1-NEXT:    pushq %r14
3735; AVX1-NEXT:    pushq %rbx
3736; AVX1-NEXT:    subq $88, %rsp
3737; AVX1-NEXT:    movq %rdi, %rbx
3738; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3739; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3740; AVX1-NEXT:    vzeroupper
3741; AVX1-NEXT:    callq __truncdfhf2
3742; AVX1-NEXT:    movl %eax, %r14d
3743; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3744; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3745; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3746; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3747; AVX1-NEXT:    vzeroupper
3748; AVX1-NEXT:    callq __truncdfhf2
3749; AVX1-NEXT:    movl %eax, %r15d
3750; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3751; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3752; AVX1-NEXT:    vzeroupper
3753; AVX1-NEXT:    callq __truncdfhf2
3754; AVX1-NEXT:    movl %eax, %ebp
3755; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3756; AVX1-NEXT:    callq __truncdfhf2
3757; AVX1-NEXT:    movw %ax, 4(%rbx)
3758; AVX1-NEXT:    movw %bp, (%rbx)
3759; AVX1-NEXT:    movw %r15w, 6(%rbx)
3760; AVX1-NEXT:    movw %r14w, 2(%rbx)
3761; AVX1-NEXT:    addq $88, %rsp
3762; AVX1-NEXT:    popq %rbx
3763; AVX1-NEXT:    popq %r14
3764; AVX1-NEXT:    popq %r15
3765; AVX1-NEXT:    popq %rbp
3766; AVX1-NEXT:    retq
3767;
3768; AVX2-LABEL: store_cvt_4f64_to_4i16:
3769; AVX2:       # %bb.0:
3770; AVX2-NEXT:    pushq %rbp
3771; AVX2-NEXT:    pushq %r15
3772; AVX2-NEXT:    pushq %r14
3773; AVX2-NEXT:    pushq %rbx
3774; AVX2-NEXT:    subq $88, %rsp
3775; AVX2-NEXT:    movq %rdi, %rbx
3776; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3777; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3778; AVX2-NEXT:    vzeroupper
3779; AVX2-NEXT:    callq __truncdfhf2
3780; AVX2-NEXT:    movl %eax, %r14d
3781; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3782; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3783; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3784; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3785; AVX2-NEXT:    vzeroupper
3786; AVX2-NEXT:    callq __truncdfhf2
3787; AVX2-NEXT:    movl %eax, %r15d
3788; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3789; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3790; AVX2-NEXT:    vzeroupper
3791; AVX2-NEXT:    callq __truncdfhf2
3792; AVX2-NEXT:    movl %eax, %ebp
3793; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3794; AVX2-NEXT:    callq __truncdfhf2
3795; AVX2-NEXT:    movw %ax, 4(%rbx)
3796; AVX2-NEXT:    movw %bp, (%rbx)
3797; AVX2-NEXT:    movw %r15w, 6(%rbx)
3798; AVX2-NEXT:    movw %r14w, 2(%rbx)
3799; AVX2-NEXT:    addq $88, %rsp
3800; AVX2-NEXT:    popq %rbx
3801; AVX2-NEXT:    popq %r14
3802; AVX2-NEXT:    popq %r15
3803; AVX2-NEXT:    popq %rbp
3804; AVX2-NEXT:    retq
3805;
3806; AVX512-LABEL: store_cvt_4f64_to_4i16:
3807; AVX512:       # %bb.0:
3808; AVX512-NEXT:    pushq %rbp
3809; AVX512-NEXT:    pushq %r15
3810; AVX512-NEXT:    pushq %r14
3811; AVX512-NEXT:    pushq %rbx
3812; AVX512-NEXT:    subq $88, %rsp
3813; AVX512-NEXT:    movq %rdi, %rbx
3814; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3815; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3816; AVX512-NEXT:    vzeroupper
3817; AVX512-NEXT:    callq __truncdfhf2
3818; AVX512-NEXT:    movl %eax, %r14d
3819; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3820; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3821; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3822; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3823; AVX512-NEXT:    vzeroupper
3824; AVX512-NEXT:    callq __truncdfhf2
3825; AVX512-NEXT:    movl %eax, %r15d
3826; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3827; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3828; AVX512-NEXT:    vzeroupper
3829; AVX512-NEXT:    callq __truncdfhf2
3830; AVX512-NEXT:    movl %eax, %ebp
3831; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3832; AVX512-NEXT:    callq __truncdfhf2
3833; AVX512-NEXT:    movw %ax, 4(%rbx)
3834; AVX512-NEXT:    movw %bp, (%rbx)
3835; AVX512-NEXT:    movw %r15w, 6(%rbx)
3836; AVX512-NEXT:    movw %r14w, 2(%rbx)
3837; AVX512-NEXT:    addq $88, %rsp
3838; AVX512-NEXT:    popq %rbx
3839; AVX512-NEXT:    popq %r14
3840; AVX512-NEXT:    popq %r15
3841; AVX512-NEXT:    popq %rbp
3842; AVX512-NEXT:    retq
3843  %1 = fptrunc <4 x double> %a0 to <4 x half>
3844  %2 = bitcast <4 x half> %1 to <4 x i16>
3845  store <4 x i16> %2, <4 x i16>* %a1
3846  ret void
3847}
3848
3849define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
3850; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
3851; AVX1:       # %bb.0:
3852; AVX1-NEXT:    pushq %rbp
3853; AVX1-NEXT:    pushq %r14
3854; AVX1-NEXT:    pushq %rbx
3855; AVX1-NEXT:    subq $32, %rsp
3856; AVX1-NEXT:    movq %rdi, %r14
3857; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3858; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3859; AVX1-NEXT:    vzeroupper
3860; AVX1-NEXT:    callq __truncdfhf2
3861; AVX1-NEXT:    movl %eax, %ebp
3862; AVX1-NEXT:    shll $16, %ebp
3863; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3864; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3865; AVX1-NEXT:    vzeroupper
3866; AVX1-NEXT:    callq __truncdfhf2
3867; AVX1-NEXT:    movzwl %ax, %ebx
3868; AVX1-NEXT:    orl %ebp, %ebx
3869; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3870; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
3871; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3872; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3873; AVX1-NEXT:    vzeroupper
3874; AVX1-NEXT:    callq __truncdfhf2
3875; AVX1-NEXT:    movl %eax, %ebp
3876; AVX1-NEXT:    shll $16, %ebp
3877; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3878; AVX1-NEXT:    callq __truncdfhf2
3879; AVX1-NEXT:    movzwl %ax, %eax
3880; AVX1-NEXT:    orl %ebp, %eax
3881; AVX1-NEXT:    shlq $32, %rax
3882; AVX1-NEXT:    orq %rbx, %rax
3883; AVX1-NEXT:    vmovq %rax, %xmm0
3884; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3885; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
3886; AVX1-NEXT:    addq $32, %rsp
3887; AVX1-NEXT:    popq %rbx
3888; AVX1-NEXT:    popq %r14
3889; AVX1-NEXT:    popq %rbp
3890; AVX1-NEXT:    retq
3891;
3892; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
3893; AVX2:       # %bb.0:
3894; AVX2-NEXT:    pushq %rbp
3895; AVX2-NEXT:    pushq %r14
3896; AVX2-NEXT:    pushq %rbx
3897; AVX2-NEXT:    subq $32, %rsp
3898; AVX2-NEXT:    movq %rdi, %r14
3899; AVX2-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3900; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3901; AVX2-NEXT:    vzeroupper
3902; AVX2-NEXT:    callq __truncdfhf2
3903; AVX2-NEXT:    movl %eax, %ebp
3904; AVX2-NEXT:    shll $16, %ebp
3905; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3906; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3907; AVX2-NEXT:    vzeroupper
3908; AVX2-NEXT:    callq __truncdfhf2
3909; AVX2-NEXT:    movzwl %ax, %ebx
3910; AVX2-NEXT:    orl %ebp, %ebx
3911; AVX2-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3912; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
3913; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3914; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3915; AVX2-NEXT:    vzeroupper
3916; AVX2-NEXT:    callq __truncdfhf2
3917; AVX2-NEXT:    movl %eax, %ebp
3918; AVX2-NEXT:    shll $16, %ebp
3919; AVX2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3920; AVX2-NEXT:    callq __truncdfhf2
3921; AVX2-NEXT:    movzwl %ax, %eax
3922; AVX2-NEXT:    orl %ebp, %eax
3923; AVX2-NEXT:    shlq $32, %rax
3924; AVX2-NEXT:    orq %rbx, %rax
3925; AVX2-NEXT:    vmovq %rax, %xmm0
3926; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3927; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
3928; AVX2-NEXT:    addq $32, %rsp
3929; AVX2-NEXT:    popq %rbx
3930; AVX2-NEXT:    popq %r14
3931; AVX2-NEXT:    popq %rbp
3932; AVX2-NEXT:    retq
3933;
3934; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
3935; AVX512:       # %bb.0:
3936; AVX512-NEXT:    pushq %rbp
3937; AVX512-NEXT:    pushq %r14
3938; AVX512-NEXT:    pushq %rbx
3939; AVX512-NEXT:    subq $32, %rsp
3940; AVX512-NEXT:    movq %rdi, %r14
3941; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
3942; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3943; AVX512-NEXT:    vzeroupper
3944; AVX512-NEXT:    callq __truncdfhf2
3945; AVX512-NEXT:    movl %eax, %ebp
3946; AVX512-NEXT:    shll $16, %ebp
3947; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
3948; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3949; AVX512-NEXT:    vzeroupper
3950; AVX512-NEXT:    callq __truncdfhf2
3951; AVX512-NEXT:    movzwl %ax, %ebx
3952; AVX512-NEXT:    orl %ebp, %ebx
3953; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
3954; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
3955; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
3956; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3957; AVX512-NEXT:    vzeroupper
3958; AVX512-NEXT:    callq __truncdfhf2
3959; AVX512-NEXT:    movl %eax, %ebp
3960; AVX512-NEXT:    shll $16, %ebp
3961; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3962; AVX512-NEXT:    callq __truncdfhf2
3963; AVX512-NEXT:    movzwl %ax, %eax
3964; AVX512-NEXT:    orl %ebp, %eax
3965; AVX512-NEXT:    shlq $32, %rax
3966; AVX512-NEXT:    orq %rbx, %rax
3967; AVX512-NEXT:    vmovq %rax, %xmm0
3968; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3969; AVX512-NEXT:    vmovdqa %xmm0, (%r14)
3970; AVX512-NEXT:    addq $32, %rsp
3971; AVX512-NEXT:    popq %rbx
3972; AVX512-NEXT:    popq %r14
3973; AVX512-NEXT:    popq %rbp
3974; AVX512-NEXT:    retq
3975  %1 = fptrunc <4 x double> %a0 to <4 x half>
3976  %2 = bitcast <4 x half> %1 to <4 x i16>
3977  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3978  store <8 x i16> %3, <8 x i16>* %a1
3979  ret void
3980}
3981
define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    pushq %r14
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $32, %rsp
; AVX1-NEXT:    movq %rdi, %r14
; AVX1-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %ebp
; AVX1-NEXT:    shll $16, %ebp
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movzwl %ax, %ebx
; AVX1-NEXT:    orl %ebp, %ebx
; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %ebp
; AVX1-NEXT:    shll $16, %ebp
; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movzwl %ax, %eax
; AVX1-NEXT:    orl %ebp, %eax
; AVX1-NEXT:    shlq $32, %rax
; AVX1-NEXT:    orq %rbx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
; AVX1-NEXT:    addq $32, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    popq %r14
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    pushq %rbp
; AVX2-SLOW-NEXT:    pushq %r14
; AVX2-SLOW-NEXT:    pushq %rbx
; AVX2-SLOW-NEXT:    subq $32, %rsp
; AVX2-SLOW-NEXT:    movq %rdi, %r14
; AVX2-SLOW-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    callq __truncdfhf2
; AVX2-SLOW-NEXT:    movl %eax, %ebp
; AVX2-SLOW-NEXT:    shll $16, %ebp
; AVX2-SLOW-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    callq __truncdfhf2
; AVX2-SLOW-NEXT:    movzwl %ax, %ebx
; AVX2-SLOW-NEXT:    orl %ebp, %ebx
; AVX2-SLOW-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    callq __truncdfhf2
; AVX2-SLOW-NEXT:    movl %eax, %ebp
; AVX2-SLOW-NEXT:    shll $16, %ebp
; AVX2-SLOW-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT:    callq __truncdfhf2
; AVX2-SLOW-NEXT:    movzwl %ax, %eax
; AVX2-SLOW-NEXT:    orl %ebp, %eax
; AVX2-SLOW-NEXT:    shlq $32, %rax
; AVX2-SLOW-NEXT:    orq %rbx, %rax
; AVX2-SLOW-NEXT:    vmovq %rax, %xmm0
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-SLOW-NEXT:    vmovdqa %xmm0, (%r14)
; AVX2-SLOW-NEXT:    addq $32, %rsp
; AVX2-SLOW-NEXT:    popq %rbx
; AVX2-SLOW-NEXT:    popq %r14
; AVX2-SLOW-NEXT:    popq %rbp
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    pushq %rbp
; AVX2-FAST-NEXT:    pushq %r14
; AVX2-FAST-NEXT:    pushq %rbx
; AVX2-FAST-NEXT:    subq $32, %rsp
; AVX2-FAST-NEXT:    movq %rdi, %r14
; AVX2-FAST-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    callq __truncdfhf2
; AVX2-FAST-NEXT:    movl %eax, %ebp
; AVX2-FAST-NEXT:    shll $16, %ebp
; AVX2-FAST-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    callq __truncdfhf2
; AVX2-FAST-NEXT:    movzwl %ax, %ebx
; AVX2-FAST-NEXT:    orl %ebp, %ebx
; AVX2-FAST-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    callq __truncdfhf2
; AVX2-FAST-NEXT:    movl %eax, %ebp
; AVX2-FAST-NEXT:    shll $16, %ebp
; AVX2-FAST-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-FAST-NEXT:    callq __truncdfhf2
; AVX2-FAST-NEXT:    movzwl %ax, %eax
; AVX2-FAST-NEXT:    orl %ebp, %eax
; AVX2-FAST-NEXT:    shlq $32, %rax
; AVX2-FAST-NEXT:    orq %rbx, %rax
; AVX2-FAST-NEXT:    vmovq %rax, %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    vmovdqa %xmm0, (%r14)
; AVX2-FAST-NEXT:    addq $32, %rsp
; AVX2-FAST-NEXT:    popq %rbx
; AVX2-FAST-NEXT:    popq %r14
; AVX2-FAST-NEXT:    popq %rbp
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    pushq %r14
; AVX512F-NEXT:    pushq %rbx
; AVX512F-NEXT:    subq $32, %rsp
; AVX512F-NEXT:    movq %rdi, %r14
; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movl %eax, %ebp
; AVX512F-NEXT:    shll $16, %ebp
; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movzwl %ax, %ebx
; AVX512F-NEXT:    orl %ebp, %ebx
; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movl %eax, %ebp
; AVX512F-NEXT:    shll $16, %ebp
; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT:    callq __truncdfhf2
; AVX512F-NEXT:    movzwl %ax, %eax
; AVX512F-NEXT:    orl %ebp, %eax
; AVX512F-NEXT:    shlq $32, %rax
; AVX512F-NEXT:    orq %rbx, %rax
; AVX512F-NEXT:    vmovq %rax, %xmm0
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT:    vmovdqa %xmm0, (%r14)
; AVX512F-NEXT:    addq $32, %rsp
; AVX512F-NEXT:    popq %rbx
; AVX512F-NEXT:    popq %r14
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    pushq %rbp
; AVX512VL-NEXT:    pushq %r14
; AVX512VL-NEXT:    pushq %rbx
; AVX512VL-NEXT:    subq $32, %rsp
; AVX512VL-NEXT:    movq %rdi, %r14
; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movl %eax, %ebp
; AVX512VL-NEXT:    shll $16, %ebp
; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movzwl %ax, %ebx
; AVX512VL-NEXT:    orl %ebp, %ebx
; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movl %eax, %ebp
; AVX512VL-NEXT:    shll $16, %ebp
; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT:    callq __truncdfhf2
; AVX512VL-NEXT:    movzwl %ax, %eax
; AVX512VL-NEXT:    orl %ebp, %eax
; AVX512VL-NEXT:    shlq $32, %rax
; AVX512VL-NEXT:    orq %rbx, %rax
; AVX512VL-NEXT:    vmovq %rax, %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14)
; AVX512VL-NEXT:    addq $32, %rsp
; AVX512VL-NEXT:    popq %rbx
; AVX512VL-NEXT:    popq %r14
; AVX512VL-NEXT:    popq %rbp
; AVX512VL-NEXT:    retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, <8 x i16>* %a1
  ret void
}

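; 8 x double -> 8 x i16: these subtargets have no direct f64->f16 conversion
; instruction, so each element is truncated through a __truncdfhf2 libcall and
; the eight results are written back with scalar word stores.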
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_8f64_to_8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    pushq %r15
; AVX1-NEXT:    pushq %r14
; AVX1-NEXT:    pushq %r13
; AVX1-NEXT:    pushq %r12
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $136, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r12d
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r13d
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %ebp
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r14d
; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movl %eax, %r15d
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2
; AVX1-NEXT:    movw %ax, 12(%rbx)
; AVX1-NEXT:    movw %r15w, 8(%rbx)
; AVX1-NEXT:    movw %r14w, 4(%rbx)
; AVX1-NEXT:    movw %bp, (%rbx)
; AVX1-NEXT:    movw %r13w, 14(%rbx)
; AVX1-NEXT:    movw %r12w, 10(%rbx)
; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT:    movw %ax, 6(%rbx)
; AVX1-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT:    movw %ax, 2(%rbx)
; AVX1-NEXT:    addq $136, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    popq %r12
; AVX1-NEXT:    popq %r13
; AVX1-NEXT:    popq %r14
; AVX1-NEXT:    popq %r15
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_8f64_to_8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    pushq %r15
; AVX2-NEXT:    pushq %r14
; AVX2-NEXT:    pushq %r13
; AVX2-NEXT:    pushq %r12
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $136, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r12d
; AVX2-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r13d
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %ebp
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r14d
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movl %eax, %r15d
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2
; AVX2-NEXT:    movw %ax, 12(%rbx)
; AVX2-NEXT:    movw %r15w, 8(%rbx)
; AVX2-NEXT:    movw %r14w, 4(%rbx)
; AVX2-NEXT:    movw %bp, (%rbx)
; AVX2-NEXT:    movw %r13w, 14(%rbx)
; AVX2-NEXT:    movw %r12w, 10(%rbx)
; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT:    movw %ax, 6(%rbx)
; AVX2-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT:    movw %ax, 2(%rbx)
; AVX2-NEXT:    addq $136, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    popq %r12
; AVX2-NEXT:    popq %r13
; AVX2-NEXT:    popq %r14
; AVX2-NEXT:    popq %r15
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    pushq %r15
; AVX512-NEXT:    pushq %r14
; AVX512-NEXT:    pushq %r13
; AVX512-NEXT:    pushq %r12
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    subq $200, %rsp
; AVX512-NEXT:    movq %rdi, %rbx
; AVX512-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r12d
; AVX512-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r13d
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %ebp
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r14d
; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movl %eax, %r15d
; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT:    callq __truncdfhf2
; AVX512-NEXT:    movw %ax, 12(%rbx)
; AVX512-NEXT:    movw %r15w, 8(%rbx)
; AVX512-NEXT:    movw %r14w, 4(%rbx)
; AVX512-NEXT:    movw %bp, (%rbx)
; AVX512-NEXT:    movw %r13w, 14(%rbx)
; AVX512-NEXT:    movw %r12w, 10(%rbx)
; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT:    movw %ax, 6(%rbx)
; AVX512-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT:    movw %ax, 2(%rbx)
; AVX512-NEXT:    addq $200, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    popq %r12
; AVX512-NEXT:    popq %r13
; AVX512-NEXT:    popq %r14
; AVX512-NEXT:    popq %r15
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* %a1
  ret void
}
