• Home
  • Raw
  • Download

Lines Matching refs:AVX512

4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
185 ; AVX512-LABEL: cvt_8i16_to_8f32:
186 ; AVX512: # BB#0:
187 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
188 ; AVX512-NEXT: movq %rdx, %r8
189 ; AVX512-NEXT: movq %rdx, %r10
190 ; AVX512-NEXT: movswl %dx, %r9d
191 ; AVX512-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
192 ; AVX512-NEXT: shrl $16, %edx
193 ; AVX512-NEXT: shrq $32, %r8
194 ; AVX512-NEXT: shrq $48, %r10
195 ; AVX512-NEXT: vmovq %xmm0, %rdi
196 ; AVX512-NEXT: movq %rdi, %rax
197 ; AVX512-NEXT: movq %rdi, %rsi
198 ; AVX512-NEXT: movswl %di, %ecx
199 ; AVX512-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
200 ; AVX512-NEXT: shrl $16, %edi
201 ; AVX512-NEXT: shrq $32, %rax
202 ; AVX512-NEXT: shrq $48, %rsi
203 ; AVX512-NEXT: movswl %si, %esi
204 ; AVX512-NEXT: vmovd %esi, %xmm0
205 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
206 ; AVX512-NEXT: cwtl
207 ; AVX512-NEXT: vmovd %eax, %xmm1
208 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
209 ; AVX512-NEXT: movswl %di, %eax
210 ; AVX512-NEXT: vmovd %eax, %xmm2
211 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
212 ; AVX512-NEXT: vmovd %ecx, %xmm3
213 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
214 ; AVX512-NEXT: movswl %r10w, %eax
215 ; AVX512-NEXT: vmovd %eax, %xmm4
216 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
217 ; AVX512-NEXT: movswl %r8w, %eax
218 ; AVX512-NEXT: vmovd %eax, %xmm5
219 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
220 ; AVX512-NEXT: movswl %dx, %eax
221 ; AVX512-NEXT: vmovd %eax, %xmm6
222 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
223 ; AVX512-NEXT: vmovd %r9d, %xmm7
224 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
225 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
226 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
227 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
228 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
229 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
230 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
231 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
232 ; AVX512-NEXT: retq
429 ; AVX512-LABEL: cvt_16i16_to_16f32:
430 ; AVX512: # BB#0:
431 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
432 ; AVX512-NEXT: vmovq %xmm0, %rax
433 ; AVX512-NEXT: movq %rax, %rcx
434 ; AVX512-NEXT: shrq $48, %rcx
435 ; AVX512-NEXT: movswl %cx, %ecx
436 ; AVX512-NEXT: vmovd %ecx, %xmm8
437 ; AVX512-NEXT: movq %rax, %rcx
438 ; AVX512-NEXT: shrq $32, %rcx
439 ; AVX512-NEXT: movswl %cx, %ecx
440 ; AVX512-NEXT: vmovd %ecx, %xmm9
441 ; AVX512-NEXT: movswl %ax, %ecx
442 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
443 ; AVX512-NEXT: shrl $16, %eax
444 ; AVX512-NEXT: cwtl
445 ; AVX512-NEXT: vmovd %eax, %xmm11
446 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
447 ; AVX512-NEXT: vmovd %ecx, %xmm12
448 ; AVX512-NEXT: movq %rax, %rcx
449 ; AVX512-NEXT: shrq $48, %rcx
450 ; AVX512-NEXT: movswl %cx, %ecx
451 ; AVX512-NEXT: vmovd %ecx, %xmm13
452 ; AVX512-NEXT: movq %rax, %rcx
453 ; AVX512-NEXT: shrq $32, %rcx
454 ; AVX512-NEXT: movswl %cx, %ecx
455 ; AVX512-NEXT: vmovd %ecx, %xmm14
456 ; AVX512-NEXT: movswl %ax, %ecx
457 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
458 ; AVX512-NEXT: shrl $16, %eax
459 ; AVX512-NEXT: cwtl
460 ; AVX512-NEXT: vmovd %eax, %xmm15
461 ; AVX512-NEXT: vmovq %xmm10, %rax
462 ; AVX512-NEXT: vmovd %ecx, %xmm2
463 ; AVX512-NEXT: movq %rax, %rcx
464 ; AVX512-NEXT: shrq $48, %rcx
465 ; AVX512-NEXT: movswl %cx, %ecx
466 ; AVX512-NEXT: vmovd %ecx, %xmm3
467 ; AVX512-NEXT: movq %rax, %rcx
468 ; AVX512-NEXT: shrq $32, %rcx
469 ; AVX512-NEXT: movswl %cx, %ecx
470 ; AVX512-NEXT: vmovd %ecx, %xmm1
471 ; AVX512-NEXT: movswl %ax, %ecx
472 ; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
473 ; AVX512-NEXT: shrl $16, %eax
474 ; AVX512-NEXT: cwtl
475 ; AVX512-NEXT: vmovd %eax, %xmm4
476 ; AVX512-NEXT: vpextrq $1, %xmm10, %rax
477 ; AVX512-NEXT: vmovd %ecx, %xmm10
478 ; AVX512-NEXT: movq %rax, %rcx
479 ; AVX512-NEXT: shrq $48, %rcx
480 ; AVX512-NEXT: movswl %cx, %ecx
481 ; AVX512-NEXT: vmovd %ecx, %xmm5
482 ; AVX512-NEXT: movq %rax, %rcx
483 ; AVX512-NEXT: shrq $32, %rcx
484 ; AVX512-NEXT: movswl %cx, %ecx
485 ; AVX512-NEXT: vmovd %ecx, %xmm6
486 ; AVX512-NEXT: movl %eax, %ecx
487 ; AVX512-NEXT: shrl $16, %ecx
488 ; AVX512-NEXT: movswl %cx, %ecx
489 ; AVX512-NEXT: vmovd %ecx, %xmm7
490 ; AVX512-NEXT: cwtl
491 ; AVX512-NEXT: vmovd %eax, %xmm0
492 ; AVX512-NEXT: vcvtph2ps %xmm8, %xmm8
493 ; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9
494 ; AVX512-NEXT: vcvtph2ps %xmm11, %xmm11
495 ; AVX512-NEXT: vcvtph2ps %xmm12, %xmm12
496 ; AVX512-NEXT: vcvtph2ps %xmm13, %xmm13
497 ; AVX512-NEXT: vcvtph2ps %xmm14, %xmm14
498 ; AVX512-NEXT: vcvtph2ps %xmm15, %xmm15
499 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
500 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
501 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
502 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
503 ; AVX512-NEXT: vcvtph2ps %xmm10, %xmm10
504 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
505 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
506 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
507 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
508 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
509 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
510 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
511 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
512 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
513 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
514 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
515 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
516 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
517 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
518 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
519 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
520 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
521 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
522 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
523 ; AVX512-NEXT: retq
675 ; AVX512-LABEL: load_cvt_8i16_to_8f32:
676 ; AVX512: # BB#0:
677 ; AVX512-NEXT: movswl 6(%rdi), %eax
678 ; AVX512-NEXT: vmovd %eax, %xmm0
679 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
680 ; AVX512-NEXT: movswl 4(%rdi), %eax
681 ; AVX512-NEXT: vmovd %eax, %xmm1
682 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
683 ; AVX512-NEXT: movswl (%rdi), %eax
684 ; AVX512-NEXT: vmovd %eax, %xmm2
685 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
686 ; AVX512-NEXT: movswl 2(%rdi), %eax
687 ; AVX512-NEXT: vmovd %eax, %xmm3
688 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
689 ; AVX512-NEXT: movswl 14(%rdi), %eax
690 ; AVX512-NEXT: vmovd %eax, %xmm4
691 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
692 ; AVX512-NEXT: movswl 12(%rdi), %eax
693 ; AVX512-NEXT: vmovd %eax, %xmm5
694 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
695 ; AVX512-NEXT: movswl 8(%rdi), %eax
696 ; AVX512-NEXT: vmovd %eax, %xmm6
697 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
698 ; AVX512-NEXT: movswl 10(%rdi), %eax
699 ; AVX512-NEXT: vmovd %eax, %xmm7
700 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
701 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
702 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
703 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
704 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
705 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
706 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
707 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
708 ; AVX512-NEXT: retq
848 ; AVX512-LABEL: load_cvt_16i16_to_16f32:
849 ; AVX512: # BB#0:
850 ; AVX512-NEXT: movswl 6(%rdi), %eax
851 ; AVX512-NEXT: vmovd %eax, %xmm0
852 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm8
853 ; AVX512-NEXT: movswl 4(%rdi), %eax
854 ; AVX512-NEXT: vmovd %eax, %xmm0
855 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm9
856 ; AVX512-NEXT: movswl (%rdi), %eax
857 ; AVX512-NEXT: vmovd %eax, %xmm0
858 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm10
859 ; AVX512-NEXT: movswl 2(%rdi), %eax
860 ; AVX512-NEXT: vmovd %eax, %xmm0
861 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm11
862 ; AVX512-NEXT: movswl 14(%rdi), %eax
863 ; AVX512-NEXT: vmovd %eax, %xmm0
864 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm12
865 ; AVX512-NEXT: movswl 12(%rdi), %eax
866 ; AVX512-NEXT: vmovd %eax, %xmm0
867 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm13
868 ; AVX512-NEXT: movswl 8(%rdi), %eax
869 ; AVX512-NEXT: vmovd %eax, %xmm0
870 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm14
871 ; AVX512-NEXT: movswl 10(%rdi), %eax
872 ; AVX512-NEXT: vmovd %eax, %xmm0
873 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm15
874 ; AVX512-NEXT: movswl 22(%rdi), %eax
875 ; AVX512-NEXT: vmovd %eax, %xmm0
876 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
877 ; AVX512-NEXT: movswl 20(%rdi), %eax
878 ; AVX512-NEXT: vmovd %eax, %xmm1
879 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
880 ; AVX512-NEXT: movswl 16(%rdi), %eax
881 ; AVX512-NEXT: vmovd %eax, %xmm2
882 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
883 ; AVX512-NEXT: movswl 18(%rdi), %eax
884 ; AVX512-NEXT: vmovd %eax, %xmm3
885 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
886 ; AVX512-NEXT: movswl 30(%rdi), %eax
887 ; AVX512-NEXT: vmovd %eax, %xmm4
888 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
889 ; AVX512-NEXT: movswl 28(%rdi), %eax
890 ; AVX512-NEXT: vmovd %eax, %xmm5
891 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
892 ; AVX512-NEXT: movswl 24(%rdi), %eax
893 ; AVX512-NEXT: vmovd %eax, %xmm6
894 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
895 ; AVX512-NEXT: movswl 26(%rdi), %eax
896 ; AVX512-NEXT: vmovd %eax, %xmm7
897 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
898 ; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
899 ; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
900 ; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
901 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
902 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
903 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
904 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
905 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
906 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
907 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
908 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
909 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
910 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
911 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
912 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
913 ; AVX512-NEXT: retq
1159 ; AVX512-LABEL: cvt_8i16_to_8f64:
1160 ; AVX512: # BB#0:
1161 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
1162 ; AVX512-NEXT: movq %rdx, %r8
1163 ; AVX512-NEXT: movl %edx, %r10d
1164 ; AVX512-NEXT: movswl %dx, %r9d
1165 ; AVX512-NEXT: shrq $48, %rdx
1166 ; AVX512-NEXT: shrq $32, %r8
1167 ; AVX512-NEXT: shrl $16, %r10d
1168 ; AVX512-NEXT: vmovq %xmm0, %rdi
1169 ; AVX512-NEXT: movq %rdi, %rax
1170 ; AVX512-NEXT: movl %edi, %esi
1171 ; AVX512-NEXT: movswl %di, %ecx
1172 ; AVX512-NEXT: shrq $48, %rdi
1173 ; AVX512-NEXT: shrq $32, %rax
1174 ; AVX512-NEXT: shrl $16, %esi
1175 ; AVX512-NEXT: movswl %si, %esi
1176 ; AVX512-NEXT: vmovd %esi, %xmm0
1177 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
1178 ; AVX512-NEXT: vmovd %ecx, %xmm1
1179 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
1180 ; AVX512-NEXT: cwtl
1181 ; AVX512-NEXT: vmovd %eax, %xmm2
1182 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1183 ; AVX512-NEXT: movswl %di, %eax
1184 ; AVX512-NEXT: vmovd %eax, %xmm3
1185 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1186 ; AVX512-NEXT: movswl %r10w, %eax
1187 ; AVX512-NEXT: vmovd %eax, %xmm4
1188 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1189 ; AVX512-NEXT: vmovd %r9d, %xmm5
1190 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1191 ; AVX512-NEXT: movswl %r8w, %eax
1192 ; AVX512-NEXT: vmovd %eax, %xmm6
1193 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
1194 ; AVX512-NEXT: movswl %dx, %eax
1195 ; AVX512-NEXT: vmovd %eax, %xmm7
1196 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
1197 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1198 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1199 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1200 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1201 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1202 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
1203 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1204 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1205 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1206 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1207 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1208 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1209 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1210 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1211 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1212 ; AVX512-NEXT: retq
1405 ; AVX512-LABEL: load_cvt_8i16_to_8f64:
1406 ; AVX512: # BB#0:
1407 ; AVX512-NEXT: movswl (%rdi), %eax
1408 ; AVX512-NEXT: vmovd %eax, %xmm0
1409 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
1410 ; AVX512-NEXT: movswl 2(%rdi), %eax
1411 ; AVX512-NEXT: vmovd %eax, %xmm1
1412 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
1413 ; AVX512-NEXT: movswl 4(%rdi), %eax
1414 ; AVX512-NEXT: vmovd %eax, %xmm2
1415 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1416 ; AVX512-NEXT: movswl 6(%rdi), %eax
1417 ; AVX512-NEXT: vmovd %eax, %xmm3
1418 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1419 ; AVX512-NEXT: movswl 8(%rdi), %eax
1420 ; AVX512-NEXT: vmovd %eax, %xmm4
1421 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1422 ; AVX512-NEXT: movswl 10(%rdi), %eax
1423 ; AVX512-NEXT: vmovd %eax, %xmm5
1424 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1425 ; AVX512-NEXT: movswl 12(%rdi), %eax
1426 ; AVX512-NEXT: vmovd %eax, %xmm6
1427 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
1428 ; AVX512-NEXT: movswl 14(%rdi), %eax
1429 ; AVX512-NEXT: vmovd %eax, %xmm7
1430 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
1431 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1432 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1433 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1434 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1435 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1436 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
1437 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1438 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1439 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1440 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1441 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1442 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1443 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1444 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1445 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1446 ; AVX512-NEXT: retq
1655 ; AVX512-LABEL: cvt_8f32_to_8i16:
1656 ; AVX512: # BB#0:
1657 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1658 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1659 ; AVX512-NEXT: vmovd %xmm1, %eax
1660 ; AVX512-NEXT: shll $16, %eax
1661 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1662 ; AVX512-NEXT: vmovd %xmm1, %ecx
1663 ; AVX512-NEXT: movzwl %cx, %ecx
1664 ; AVX512-NEXT: orl %eax, %ecx
1665 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1666 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1667 ; AVX512-NEXT: vmovd %xmm1, %edx
1668 ; AVX512-NEXT: shll $16, %edx
1669 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1670 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1671 ; AVX512-NEXT: vmovd %xmm1, %eax
1672 ; AVX512-NEXT: movzwl %ax, %eax
1673 ; AVX512-NEXT: orl %edx, %eax
1674 ; AVX512-NEXT: shlq $32, %rax
1675 ; AVX512-NEXT: orq %rcx, %rax
1676 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
1677 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1678 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1679 ; AVX512-NEXT: vmovd %xmm1, %ecx
1680 ; AVX512-NEXT: shll $16, %ecx
1681 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1682 ; AVX512-NEXT: vmovd %xmm1, %edx
1683 ; AVX512-NEXT: movzwl %dx, %edx
1684 ; AVX512-NEXT: orl %ecx, %edx
1685 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1686 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1687 ; AVX512-NEXT: vmovd %xmm1, %ecx
1688 ; AVX512-NEXT: shll $16, %ecx
1689 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1690 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1691 ; AVX512-NEXT: vmovd %xmm0, %esi
1692 ; AVX512-NEXT: movzwl %si, %esi
1693 ; AVX512-NEXT: orl %ecx, %esi
1694 ; AVX512-NEXT: shlq $32, %rsi
1695 ; AVX512-NEXT: orq %rdx, %rsi
1696 ; AVX512-NEXT: vmovq %rsi, %xmm0
1697 ; AVX512-NEXT: vmovq %rax, %xmm1
1698 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1699 ; AVX512-NEXT: retq
1840 ; AVX512-LABEL: cvt_16f32_to_16i16:
1841 ; AVX512: # BB#0:
1842 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
1843 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1844 ; AVX512-NEXT: vmovd %xmm2, %eax
1845 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1846 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1847 ; AVX512-NEXT: vmovd %eax, %xmm3
1848 ; AVX512-NEXT: vmovd %xmm2, %eax
1849 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1850 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1851 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1852 ; AVX512-NEXT: vmovd %xmm2, %eax
1853 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
1854 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1855 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1856 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1857 ; AVX512-NEXT: vmovd %xmm1, %eax
1858 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1859 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1860 ; AVX512-NEXT: vmovd %xmm1, %eax
1861 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1862 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1863 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1864 ; AVX512-NEXT: vmovd %xmm1, %eax
1865 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1866 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1867 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1868 ; AVX512-NEXT: vmovd %xmm1, %eax
1869 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1870 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1871 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1872 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1873 ; AVX512-NEXT: vmovd %xmm2, %eax
1874 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1875 ; AVX512-NEXT: vmovd %xmm1, %eax
1876 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1877 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1878 ; AVX512-NEXT: vmovd %eax, %xmm3
1879 ; AVX512-NEXT: vmovd %xmm1, %eax
1880 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1881 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1882 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1883 ; AVX512-NEXT: vmovd %xmm1, %eax
1884 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
1885 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1886 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1887 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1888 ; AVX512-NEXT: vmovd %xmm0, %eax
1889 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1890 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1891 ; AVX512-NEXT: vmovd %xmm0, %eax
1892 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1893 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1894 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1895 ; AVX512-NEXT: vmovd %xmm0, %eax
1896 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
1897 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1898 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1899 ; AVX512-NEXT: vmovd %xmm0, %eax
1900 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1901 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1902 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1903 ; AVX512-NEXT: vmovd %xmm0, %eax
1904 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1905 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1906 ; AVX512-NEXT: retq
2093 ; AVX512-LABEL: store_cvt_8f32_to_8i16:
2094 ; AVX512: # BB#0:
2095 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2096 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2097 ; AVX512-NEXT: vmovd %xmm1, %r8d
2098 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2099 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2100 ; AVX512-NEXT: vmovd %xmm1, %r9d
2101 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2102 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2103 ; AVX512-NEXT: vmovd %xmm1, %r10d
2104 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
2105 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2106 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2107 ; AVX512-NEXT: vmovd %xmm2, %r11d
2108 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2109 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2110 ; AVX512-NEXT: vmovd %xmm2, %eax
2111 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2112 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2113 ; AVX512-NEXT: vmovd %xmm2, %ecx
2114 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2115 ; AVX512-NEXT: vmovd %xmm0, %edx
2116 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2117 ; AVX512-NEXT: vmovd %xmm0, %esi
2118 ; AVX512-NEXT: movw %si, 8(%rdi)
2119 ; AVX512-NEXT: movw %dx, (%rdi)
2120 ; AVX512-NEXT: movw %cx, 14(%rdi)
2121 ; AVX512-NEXT: movw %ax, 12(%rdi)
2122 ; AVX512-NEXT: movw %r11w, 10(%rdi)
2123 ; AVX512-NEXT: movw %r10w, 6(%rdi)
2124 ; AVX512-NEXT: movw %r9w, 4(%rdi)
2125 ; AVX512-NEXT: movw %r8w, 2(%rdi)
2126 ; AVX512-NEXT: retq
2268 ; AVX512-LABEL: store_cvt_16f32_to_16i16:
2269 ; AVX512: # BB#0:
2270 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
2271 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
2272 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
2273 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
2274 ; AVX512-NEXT: vmovd %xmm4, %eax
2275 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
2276 ; AVX512-NEXT: movw %ax, 24(%rdi)
2277 ; AVX512-NEXT: vmovd %xmm4, %eax
2278 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
2279 ; AVX512-NEXT: movw %ax, 16(%rdi)
2280 ; AVX512-NEXT: vmovd %xmm4, %eax
2281 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
2282 ; AVX512-NEXT: movw %ax, 8(%rdi)
2283 ; AVX512-NEXT: vmovd %xmm4, %eax
2284 ; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2285 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2286 ; AVX512-NEXT: movw %ax, (%rdi)
2287 ; AVX512-NEXT: vmovd %xmm4, %eax
2288 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2289 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2290 ; AVX512-NEXT: movw %ax, 30(%rdi)
2291 ; AVX512-NEXT: vmovd %xmm4, %eax
2292 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2293 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2294 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2295 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2296 ; AVX512-NEXT: movw %ax, 28(%rdi)
2297 ; AVX512-NEXT: vmovd %xmm3, %eax
2298 ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
2299 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2300 ; AVX512-NEXT: movw %ax, 26(%rdi)
2301 ; AVX512-NEXT: vmovd %xmm3, %eax
2302 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
2303 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2304 ; AVX512-NEXT: movw %ax, 22(%rdi)
2305 ; AVX512-NEXT: vmovd %xmm3, %eax
2306 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2307 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2308 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2309 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2310 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
2311 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2312 ; AVX512-NEXT: movw %ax, 20(%rdi)
2313 ; AVX512-NEXT: vmovd %xmm2, %eax
2314 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2315 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2316 ; AVX512-NEXT: movw %ax, 18(%rdi)
2317 ; AVX512-NEXT: vmovd %xmm2, %eax
2318 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2319 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2320 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
2321 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2322 ; AVX512-NEXT: movw %ax, 14(%rdi)
2323 ; AVX512-NEXT: vmovd %xmm1, %eax
2324 ; AVX512-NEXT: movw %ax, 12(%rdi)
2325 ; AVX512-NEXT: vmovd %xmm2, %eax
2326 ; AVX512-NEXT: movw %ax, 10(%rdi)
2327 ; AVX512-NEXT: vmovd %xmm0, %eax
2328 ; AVX512-NEXT: movw %ax, 6(%rdi)
2329 ; AVX512-NEXT: vmovd %xmm3, %eax
2330 ; AVX512-NEXT: movw %ax, 4(%rdi)
2331 ; AVX512-NEXT: vmovd %xmm4, %eax
2332 ; AVX512-NEXT: movw %ax, 2(%rdi)
2333 ; AVX512-NEXT: retq
2477 ; AVX512-LABEL: cvt_4f64_to_4i16:
2478 ; AVX512: # BB#0:
2479 ; AVX512-NEXT: pushq %r14
2480 ; AVX512-NEXT: .Ltmp3:
2481 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2482 ; AVX512-NEXT: pushq %rbx
2483 ; AVX512-NEXT: .Ltmp4:
2484 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2485 ; AVX512-NEXT: subq $40, %rsp
2486 ; AVX512-NEXT: .Ltmp5:
2487 ; AVX512-NEXT: .cfi_def_cfa_offset 64
2488 ; AVX512-NEXT: .Ltmp6:
2489 ; AVX512-NEXT: .cfi_offset %rbx, -24
2490 ; AVX512-NEXT: .Ltmp7:
2491 ; AVX512-NEXT: .cfi_offset %r14, -16
2492 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2493 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2494 ; AVX512-NEXT: callq __truncdfhf2
2495 ; AVX512-NEXT: movw %ax, %bx
2496 ; AVX512-NEXT: shll $16, %ebx
2497 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2498 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2499 ; AVX512-NEXT: callq __truncdfhf2
2500 ; AVX512-NEXT: movzwl %ax, %r14d
2501 ; AVX512-NEXT: orl %ebx, %r14d
2502 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2503 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2504 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2505 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2506 ; AVX512-NEXT: callq __truncdfhf2
2507 ; AVX512-NEXT: movw %ax, %bx
2508 ; AVX512-NEXT: shll $16, %ebx
2509 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2510 ; AVX512-NEXT: callq __truncdfhf2
2511 ; AVX512-NEXT: movzwl %ax, %eax
2512 ; AVX512-NEXT: orl %ebx, %eax
2513 ; AVX512-NEXT: shlq $32, %rax
2514 ; AVX512-NEXT: orq %r14, %rax
2515 ; AVX512-NEXT: vmovq %rax, %xmm0
2516 ; AVX512-NEXT: addq $40, %rsp
2517 ; AVX512-NEXT: popq %rbx
2518 ; AVX512-NEXT: popq %r14
2519 ; AVX512-NEXT: retq
2622 ; AVX512-LABEL: cvt_4f64_to_8i16_undef:
2623 ; AVX512: # BB#0:
2624 ; AVX512-NEXT: pushq %r14
2625 ; AVX512-NEXT: .Ltmp8:
2626 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2627 ; AVX512-NEXT: pushq %rbx
2628 ; AVX512-NEXT: .Ltmp9:
2629 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2630 ; AVX512-NEXT: subq $40, %rsp
2631 ; AVX512-NEXT: .Ltmp10:
2632 ; AVX512-NEXT: .cfi_def_cfa_offset 64
2633 ; AVX512-NEXT: .Ltmp11:
2634 ; AVX512-NEXT: .cfi_offset %rbx, -24
2635 ; AVX512-NEXT: .Ltmp12:
2636 ; AVX512-NEXT: .cfi_offset %r14, -16
2637 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2638 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2639 ; AVX512-NEXT: callq __truncdfhf2
2640 ; AVX512-NEXT: movw %ax, %bx
2641 ; AVX512-NEXT: shll $16, %ebx
2642 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2643 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2644 ; AVX512-NEXT: callq __truncdfhf2
2645 ; AVX512-NEXT: movzwl %ax, %r14d
2646 ; AVX512-NEXT: orl %ebx, %r14d
2647 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2648 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2649 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2650 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2651 ; AVX512-NEXT: callq __truncdfhf2
2652 ; AVX512-NEXT: movw %ax, %bx
2653 ; AVX512-NEXT: shll $16, %ebx
2654 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2655 ; AVX512-NEXT: callq __truncdfhf2
2656 ; AVX512-NEXT: movzwl %ax, %eax
2657 ; AVX512-NEXT: orl %ebx, %eax
2658 ; AVX512-NEXT: shlq $32, %rax
2659 ; AVX512-NEXT: orq %r14, %rax
2660 ; AVX512-NEXT: vmovq %rax, %xmm0
2661 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2662 ; AVX512-NEXT: addq $40, %rsp
2663 ; AVX512-NEXT: popq %rbx
2664 ; AVX512-NEXT: popq %r14
2665 ; AVX512-NEXT: retq
2769 ; AVX512-LABEL: cvt_4f64_to_8i16_zero:
2770 ; AVX512: # BB#0:
2771 ; AVX512-NEXT: pushq %r14
2772 ; AVX512-NEXT: .Ltmp13:
2773 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2774 ; AVX512-NEXT: pushq %rbx
2775 ; AVX512-NEXT: .Ltmp14:
2776 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2777 ; AVX512-NEXT: subq $40, %rsp
2778 ; AVX512-NEXT: .Ltmp15:
2779 ; AVX512-NEXT: .cfi_def_cfa_offset 64
2780 ; AVX512-NEXT: .Ltmp16:
2781 ; AVX512-NEXT: .cfi_offset %rbx, -24
2782 ; AVX512-NEXT: .Ltmp17:
2783 ; AVX512-NEXT: .cfi_offset %r14, -16
2784 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2785 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2786 ; AVX512-NEXT: callq __truncdfhf2
2787 ; AVX512-NEXT: movw %ax, %bx
2788 ; AVX512-NEXT: shll $16, %ebx
2789 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2790 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2791 ; AVX512-NEXT: callq __truncdfhf2
2792 ; AVX512-NEXT: movzwl %ax, %r14d
2793 ; AVX512-NEXT: orl %ebx, %r14d
2794 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2795 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2796 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2797 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2798 ; AVX512-NEXT: callq __truncdfhf2
2799 ; AVX512-NEXT: movw %ax, %bx
2800 ; AVX512-NEXT: shll $16, %ebx
2801 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2802 ; AVX512-NEXT: callq __truncdfhf2
2803 ; AVX512-NEXT: movzwl %ax, %eax
2804 ; AVX512-NEXT: orl %ebx, %eax
2805 ; AVX512-NEXT: shlq $32, %rax
2806 ; AVX512-NEXT: orq %r14, %rax
2807 ; AVX512-NEXT: vmovq %rax, %xmm0
2808 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
2809 ; AVX512-NEXT: addq $40, %rsp
2810 ; AVX512-NEXT: popq %rbx
2811 ; AVX512-NEXT: popq %r14
2812 ; AVX512-NEXT: retq
2984 ; AVX512-LABEL: cvt_8f64_to_8i16:
2985 ; AVX512: # BB#0:
2986 ; AVX512-NEXT: pushq %r15
2987 ; AVX512-NEXT: .Ltmp18:
2988 ; AVX512-NEXT: .cfi_def_cfa_offset 16
2989 ; AVX512-NEXT: pushq %r14
2990 ; AVX512-NEXT: .Ltmp19:
2991 ; AVX512-NEXT: .cfi_def_cfa_offset 24
2992 ; AVX512-NEXT: pushq %rbx
2993 ; AVX512-NEXT: .Ltmp20:
2994 ; AVX512-NEXT: .cfi_def_cfa_offset 32
2995 ; AVX512-NEXT: subq $96, %rsp
2996 ; AVX512-NEXT: .Ltmp21:
2997 ; AVX512-NEXT: .cfi_def_cfa_offset 128
2998 ; AVX512-NEXT: .Ltmp22:
2999 ; AVX512-NEXT: .cfi_offset %rbx, -32
3000 ; AVX512-NEXT: .Ltmp23:
3001 ; AVX512-NEXT: .cfi_offset %r14, -24
3002 ; AVX512-NEXT: .Ltmp24:
3003 ; AVX512-NEXT: .cfi_offset %r15, -16
3004 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
3005 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3006 ; AVX512-NEXT: callq __truncdfhf2
3007 ; AVX512-NEXT: movw %ax, %bx
3008 ; AVX512-NEXT: shll $16, %ebx
3009 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
3010 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
3011 ; AVX512-NEXT: callq __truncdfhf2
3012 ; AVX512-NEXT: movzwl %ax, %r15d
3013 ; AVX512-NEXT: orl %ebx, %r15d
3014 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
3015 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3016 ; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3017 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3018 ; AVX512-NEXT: callq __truncdfhf2
3019 ; AVX512-NEXT: movw %ax, %bx
3020 ; AVX512-NEXT: shll $16, %ebx
3021 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3022 ; AVX512-NEXT: callq __truncdfhf2
3023 ; AVX512-NEXT: movzwl %ax, %r14d
3024 ; AVX512-NEXT: orl %ebx, %r14d
3025 ; AVX512-NEXT: shlq $32, %r14
3026 ; AVX512-NEXT: orq %r15, %r14
3027 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
3028 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3029 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3030 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3031 ; AVX512-NEXT: callq __truncdfhf2
3032 ; AVX512-NEXT: movw %ax, %bx
3033 ; AVX512-NEXT: shll $16, %ebx
3034 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3035 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3036 ; AVX512-NEXT: callq __truncdfhf2
3037 ; AVX512-NEXT: movzwl %ax, %r15d
3038 ; AVX512-NEXT: orl %ebx, %r15d
3039 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3040 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3041 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3042 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3043 ; AVX512-NEXT: callq __truncdfhf2
3044 ; AVX512-NEXT: movw %ax, %bx
3045 ; AVX512-NEXT: shll $16, %ebx
3046 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3047 ; AVX512-NEXT: callq __truncdfhf2
3048 ; AVX512-NEXT: movzwl %ax, %eax
3049 ; AVX512-NEXT: orl %ebx, %eax
3050 ; AVX512-NEXT: shlq $32, %rax
3051 ; AVX512-NEXT: orq %r15, %rax
3052 ; AVX512-NEXT: vmovq %rax, %xmm0
3053 ; AVX512-NEXT: vmovq %r14, %xmm1
3054 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3055 ; AVX512-NEXT: addq $96, %rsp
3056 ; AVX512-NEXT: popq %rbx
3057 ; AVX512-NEXT: popq %r14
3058 ; AVX512-NEXT: popq %r15
3059 ; AVX512-NEXT: retq
3236 ; AVX512-LABEL: store_cvt_4f64_to_4i16:
3237 ; AVX512: # BB#0:
3238 ; AVX512-NEXT: pushq %rbp
3239 ; AVX512-NEXT: .Ltmp32:
3240 ; AVX512-NEXT: .cfi_def_cfa_offset 16
3241 ; AVX512-NEXT: pushq %r15
3242 ; AVX512-NEXT: .Ltmp33:
3243 ; AVX512-NEXT: .cfi_def_cfa_offset 24
3244 ; AVX512-NEXT: pushq %r14
3245 ; AVX512-NEXT: .Ltmp34:
3246 ; AVX512-NEXT: .cfi_def_cfa_offset 32
3247 ; AVX512-NEXT: pushq %rbx
3248 ; AVX512-NEXT: .Ltmp35:
3249 ; AVX512-NEXT: .cfi_def_cfa_offset 40
3250 ; AVX512-NEXT: subq $88, %rsp
3251 ; AVX512-NEXT: .Ltmp36:
3252 ; AVX512-NEXT: .cfi_def_cfa_offset 128
3253 ; AVX512-NEXT: .Ltmp37:
3254 ; AVX512-NEXT: .cfi_offset %rbx, -40
3255 ; AVX512-NEXT: .Ltmp38:
3256 ; AVX512-NEXT: .cfi_offset %r14, -32
3257 ; AVX512-NEXT: .Ltmp39:
3258 ; AVX512-NEXT: .cfi_offset %r15, -24
3259 ; AVX512-NEXT: .Ltmp40:
3260 ; AVX512-NEXT: .cfi_offset %rbp, -16
3261 ; AVX512-NEXT: movq %rdi, %rbx
3262 ; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3263 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3264 ; AVX512-NEXT: callq __truncdfhf2
3265 ; AVX512-NEXT: movl %eax, %r14d
3266 ; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3267 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3268 ; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3269 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3270 ; AVX512-NEXT: callq __truncdfhf2
3271 ; AVX512-NEXT: movl %eax, %r15d
3272 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3273 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3274 ; AVX512-NEXT: callq __truncdfhf2
3275 ; AVX512-NEXT: movl %eax, %ebp
3276 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3277 ; AVX512-NEXT: callq __truncdfhf2
3278 ; AVX512-NEXT: movw %ax, 4(%rbx)
3279 ; AVX512-NEXT: movw %bp, (%rbx)
3280 ; AVX512-NEXT: movw %r15w, 6(%rbx)
3281 ; AVX512-NEXT: movw %r14w, 2(%rbx)
3282 ; AVX512-NEXT: addq $88, %rsp
3283 ; AVX512-NEXT: popq %rbx
3284 ; AVX512-NEXT: popq %r14
3285 ; AVX512-NEXT: popq %r15
3286 ; AVX512-NEXT: popq %rbp
3287 ; AVX512-NEXT: retq
3407 ; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
3408 ; AVX512: # BB#0:
3409 ; AVX512-NEXT: pushq %rbp
3410 ; AVX512-NEXT: .Ltmp41:
3411 ; AVX512-NEXT: .cfi_def_cfa_offset 16
3412 ; AVX512-NEXT: pushq %r14
3413 ; AVX512-NEXT: .Ltmp42:
3414 ; AVX512-NEXT: .cfi_def_cfa_offset 24
3415 ; AVX512-NEXT: pushq %rbx
3416 ; AVX512-NEXT: .Ltmp43:
3417 ; AVX512-NEXT: .cfi_def_cfa_offset 32
3418 ; AVX512-NEXT: subq $32, %rsp
3419 ; AVX512-NEXT: .Ltmp44:
3420 ; AVX512-NEXT: .cfi_def_cfa_offset 64
3421 ; AVX512-NEXT: .Ltmp45:
3422 ; AVX512-NEXT: .cfi_offset %rbx, -32
3423 ; AVX512-NEXT: .Ltmp46:
3424 ; AVX512-NEXT: .cfi_offset %r14, -24
3425 ; AVX512-NEXT: .Ltmp47:
3426 ; AVX512-NEXT: .cfi_offset %rbp, -16
3427 ; AVX512-NEXT: movq %rdi, %r14
3428 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3429 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3430 ; AVX512-NEXT: callq __truncdfhf2
3431 ; AVX512-NEXT: movw %ax, %bp
3432 ; AVX512-NEXT: shll $16, %ebp
3433 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3434 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3435 ; AVX512-NEXT: callq __truncdfhf2
3436 ; AVX512-NEXT: movzwl %ax, %ebx
3437 ; AVX512-NEXT: orl %ebp, %ebx
3438 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3439 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3440 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3441 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3442 ; AVX512-NEXT: callq __truncdfhf2
3443 ; AVX512-NEXT: movw %ax, %bp
3444 ; AVX512-NEXT: shll $16, %ebp
3445 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3446 ; AVX512-NEXT: callq __truncdfhf2
3447 ; AVX512-NEXT: movzwl %ax, %eax
3448 ; AVX512-NEXT: orl %ebp, %eax
3449 ; AVX512-NEXT: shlq $32, %rax
3450 ; AVX512-NEXT: orq %rbx, %rax
3451 ; AVX512-NEXT: vmovq %rax, %xmm0
3452 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3453 ; AVX512-NEXT: vmovdqa %xmm0, (%r14)
3454 ; AVX512-NEXT: addq $32, %rsp
3455 ; AVX512-NEXT: popq %rbx
3456 ; AVX512-NEXT: popq %r14
3457 ; AVX512-NEXT: popq %rbp
3458 ; AVX512-NEXT: retq
3579 ; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
3580 ; AVX512: # BB#0:
3581 ; AVX512-NEXT: pushq %rbp
3582 ; AVX512-NEXT: .Ltmp48:
3583 ; AVX512-NEXT: .cfi_def_cfa_offset 16
3584 ; AVX512-NEXT: pushq %r14
3585 ; AVX512-NEXT: .Ltmp49:
3586 ; AVX512-NEXT: .cfi_def_cfa_offset 24
3587 ; AVX512-NEXT: pushq %rbx
3588 ; AVX512-NEXT: .Ltmp50:
3589 ; AVX512-NEXT: .cfi_def_cfa_offset 32
3590 ; AVX512-NEXT: subq $32, %rsp
3591 ; AVX512-NEXT: .Ltmp51:
3592 ; AVX512-NEXT: .cfi_def_cfa_offset 64
3593 ; AVX512-NEXT: .Ltmp52:
3594 ; AVX512-NEXT: .cfi_offset %rbx, -32
3595 ; AVX512-NEXT: .Ltmp53:
3596 ; AVX512-NEXT: .cfi_offset %r14, -24
3597 ; AVX512-NEXT: .Ltmp54:
3598 ; AVX512-NEXT: .cfi_offset %rbp, -16
3599 ; AVX512-NEXT: movq %rdi, %r14
3600 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3601 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3602 ; AVX512-NEXT: callq __truncdfhf2
3603 ; AVX512-NEXT: movw %ax, %bp
3604 ; AVX512-NEXT: shll $16, %ebp
3605 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3606 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3607 ; AVX512-NEXT: callq __truncdfhf2
3608 ; AVX512-NEXT: movzwl %ax, %ebx
3609 ; AVX512-NEXT: orl %ebp, %ebx
3610 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3611 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3612 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3613 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3614 ; AVX512-NEXT: callq __truncdfhf2
3615 ; AVX512-NEXT: movw %ax, %bp
3616 ; AVX512-NEXT: shll $16, %ebp
3617 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3618 ; AVX512-NEXT: callq __truncdfhf2
3619 ; AVX512-NEXT: movzwl %ax, %eax
3620 ; AVX512-NEXT: orl %ebp, %eax
3621 ; AVX512-NEXT: shlq $32, %rax
3622 ; AVX512-NEXT: orq %rbx, %rax
3623 ; AVX512-NEXT: vmovq %rax, %xmm0
3624 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
3625 ; AVX512-NEXT: vmovdqa %xmm0, (%r14)
3626 ; AVX512-NEXT: addq $32, %rsp
3627 ; AVX512-NEXT: popq %rbx
3628 ; AVX512-NEXT: popq %r14
3629 ; AVX512-NEXT: popq %rbp
3630 ; AVX512-NEXT: retq
3829 ; AVX512-LABEL: store_cvt_8f64_to_8i16:
3830 ; AVX512: # BB#0:
3831 ; AVX512-NEXT: pushq %rbp
3832 ; AVX512-NEXT: .Ltmp55:
3833 ; AVX512-NEXT: .cfi_def_cfa_offset 16
3834 ; AVX512-NEXT: pushq %r15
3835 ; AVX512-NEXT: .Ltmp56:
3836 ; AVX512-NEXT: .cfi_def_cfa_offset 24
3837 ; AVX512-NEXT: pushq %r14
3838 ; AVX512-NEXT: .Ltmp57:
3839 ; AVX512-NEXT: .cfi_def_cfa_offset 32
3840 ; AVX512-NEXT: pushq %r13
3841 ; AVX512-NEXT: .Ltmp58:
3842 ; AVX512-NEXT: .cfi_def_cfa_offset 40
3843 ; AVX512-NEXT: pushq %r12
3844 ; AVX512-NEXT: .Ltmp59:
3845 ; AVX512-NEXT: .cfi_def_cfa_offset 48
3846 ; AVX512-NEXT: pushq %rbx
3847 ; AVX512-NEXT: .Ltmp60:
3848 ; AVX512-NEXT: .cfi_def_cfa_offset 56
3849 ; AVX512-NEXT: subq $200, %rsp
3850 ; AVX512-NEXT: .Ltmp61:
3851 ; AVX512-NEXT: .cfi_def_cfa_offset 256
3852 ; AVX512-NEXT: .Ltmp62:
3853 ; AVX512-NEXT: .cfi_offset %rbx, -56
3854 ; AVX512-NEXT: .Ltmp63:
3855 ; AVX512-NEXT: .cfi_offset %r12, -48
3856 ; AVX512-NEXT: .Ltmp64:
3857 ; AVX512-NEXT: .cfi_offset %r13, -40
3858 ; AVX512-NEXT: .Ltmp65:
3859 ; AVX512-NEXT: .cfi_offset %r14, -32
3860 ; AVX512-NEXT: .Ltmp66:
3861 ; AVX512-NEXT: .cfi_offset %r15, -24
3862 ; AVX512-NEXT: .Ltmp67:
3863 ; AVX512-NEXT: .cfi_offset %rbp, -16
3864 ; AVX512-NEXT: movq %rdi, %rbx
3865 ; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
3866 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3867 ; AVX512-NEXT: callq __truncdfhf2
3868 ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3869 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
3870 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3871 ; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3872 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3873 ; AVX512-NEXT: callq __truncdfhf2
3874 ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
3875 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
3876 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3877 ; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
3878 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3879 ; AVX512-NEXT: callq __truncdfhf2
3880 ; AVX512-NEXT: movl %eax, %r12d
3881 ; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3882 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3883 ; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
3884 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3885 ; AVX512-NEXT: callq __truncdfhf2
3886 ; AVX512-NEXT: movl %eax, %r13d
3887 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
3888 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
3889 ; AVX512-NEXT: callq __truncdfhf2
3890 ; AVX512-NEXT: movl %eax, %ebp
3891 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3892 ; AVX512-NEXT: callq __truncdfhf2
3893 ; AVX512-NEXT: movl %eax, %r14d
3894 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
3895 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3896 ; AVX512-NEXT: callq __truncdfhf2
3897 ; AVX512-NEXT: movl %eax, %r15d
3898 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
3899 ; AVX512-NEXT: callq __truncdfhf2
3900 ; AVX512-NEXT: movw %ax, 12(%rbx)
3901 ; AVX512-NEXT: movw %r15w, 8(%rbx)
3902 ; AVX512-NEXT: movw %r14w, 4(%rbx)
3903 ; AVX512-NEXT: movw %bp, (%rbx)
3904 ; AVX512-NEXT: movw %r13w, 14(%rbx)
3905 ; AVX512-NEXT: movw %r12w, 10(%rbx)
3906 ; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3907 ; AVX512-NEXT: movw %ax, 6(%rbx)
3908 ; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
3909 ; AVX512-NEXT: movw %ax, 2(%rbx)
3910 ; AVX512-NEXT: addq $200, %rsp
3911 ; AVX512-NEXT: popq %rbx
3912 ; AVX512-NEXT: popq %r12
3913 ; AVX512-NEXT: popq %r13
3914 ; AVX512-NEXT: popq %r14
3915 ; AVX512-NEXT: popq %r15
3916 ; AVX512-NEXT: popq %rbp
3917 ; AVX512-NEXT: retq