; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL

;
; Half to Float
;

define float @cvt_i16_to_f32(i16 %a0) nounwind {
; ALL-LABEL: cvt_i16_to_f32:
; ALL: # %bb.0:
; ALL-NEXT: movswl %di, %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vmovd %esi, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4i16_to_4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vmovd %esi, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4i16_to_4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: vmovd %esi, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vmovd %esi, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vmovd %esi, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_8i16_to_4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: vmovd %esi, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; ALL-LABEL: cvt_8i16_to_8f32:
; ALL: # %bb.0:
; ALL-NEXT: vpextrq $1, %xmm0, %rdx
; ALL-NEXT: movq %rdx, %r8
; ALL-NEXT: movq %rdx, %r10
; ALL-NEXT: movswl %dx, %r9d
; ALL-NEXT: # kill: def $edx killed $edx killed $rdx
; ALL-NEXT: shrl $16, %edx
; ALL-NEXT: shrq $32, %r8
; ALL-NEXT: shrq $48, %r10
; ALL-NEXT: vmovq %xmm0, %rdi
; ALL-NEXT: movq %rdi, %rax
; ALL-NEXT: movq %rdi, %rsi
; ALL-NEXT: movswl %di, %ecx
; ALL-NEXT: # kill: def $edi killed $edi killed $rdi
; ALL-NEXT: shrl $16, %edi
; ALL-NEXT: shrq $32, %rax
; ALL-NEXT: shrq $48, %rsi
; ALL-NEXT: movswl %si, %esi
; ALL-NEXT: vmovd %esi, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: cwtl
; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: movswl %di, %eax
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: vmovd %ecx, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: movswl %r10w, %eax
; ALL-NEXT: vmovd %eax, %xmm4
; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
; ALL-NEXT: movswl %r8w, %eax
; ALL-NEXT: vmovd %eax, %xmm5
; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
; ALL-NEXT: movswl %dx, %eax
; ALL-NEXT: vmovd %eax, %xmm6
; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
; ALL-NEXT: vmovd %r9d, %xmm7
; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; ALL-NEXT: retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}

define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovq %xmm4, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $48, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm8
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm9
; AVX1-NEXT: movswl %ax, %ecx
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm10
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: vmovd %ecx, %xmm11
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $48, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm12
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm13
; AVX1-NEXT: movswl %ax, %ecx
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm14
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vmovd %ecx, %xmm15
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $48, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm3
; AVX1-NEXT: movswl %ax, %ecx
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm4
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $48, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm5
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm6
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $16, %ecx
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm7
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8
; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9
; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10
; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11
; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12
; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13
; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14
; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vmovq %xmm4, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $48, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm8
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm9
; AVX2-NEXT: movswl %ax, %ecx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm10
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: vmovd %ecx, %xmm11
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $48, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm12
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm13
; AVX2-NEXT: movswl %ax, %ecx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm14
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vmovd %ecx, %xmm15
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $48, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm3
; AVX2-NEXT: movswl %ax, %ecx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm4
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $48, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm5
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm6
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $16, %ecx
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm7
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8
; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9
; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10
; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11
; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12
; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13
; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14
; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_16i16_to_16f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm8
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm9
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm11
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vmovd %ecx, %xmm12
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm13
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm14
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm15
; AVX512F-NEXT: vmovq %xmm10, %rax
; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm3
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm4
; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
; AVX512F-NEXT: vmovd %ecx, %xmm10
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm5
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm6
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: shrl $16, %ecx
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm7
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $48, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm8
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm9
; AVX512VL-NEXT: movswl %ax, %ecx
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm11
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vmovd %ecx, %xmm12
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $48, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm13
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm14
; AVX512VL-NEXT: movswl %ax, %ecx
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm15
; AVX512VL-NEXT: vmovq %xmm10, %rax
; AVX512VL-NEXT: vmovd %ecx, %xmm16
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $48, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm17
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm18
; AVX512VL-NEXT: movswl %ax, %ecx
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm19
; AVX512VL-NEXT: vpextrq $1, %xmm10, %rax
; AVX512VL-NEXT: vmovd %ecx, %xmm10
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $48, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm20
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm21
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: shrl $16, %ecx
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm22
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm8, %xmm8
; AVX512VL-NEXT: vcvtph2ps %xmm9, %xmm9
; AVX512VL-NEXT: vcvtph2ps %xmm11, %xmm11
; AVX512VL-NEXT: vcvtph2ps %xmm12, %xmm12
; AVX512VL-NEXT: vcvtph2ps %xmm13, %xmm13
; AVX512VL-NEXT: vcvtph2ps %xmm14, %xmm14
; AVX512VL-NEXT: vcvtph2ps %xmm15, %xmm15
; AVX512VL-NEXT: vcvtph2ps %xmm16, %xmm16
; AVX512VL-NEXT: vcvtph2ps %xmm17, %xmm4
; AVX512VL-NEXT: vcvtph2ps %xmm18, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm19, %xmm5
; AVX512VL-NEXT: vcvtph2ps %xmm10, %xmm7
; AVX512VL-NEXT: vcvtph2ps %xmm20, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm21, %xmm6
; AVX512VL-NEXT: vcvtph2ps %xmm22, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
  %1 = bitcast <16 x i16> %a0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

;
; Half to Float (Load)
;

define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
; ALL-LABEL: load_cvt_i16_to_f32:
; ALL: # %bb.0:
; ALL-NEXT: movswl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: retq
  %1 = load i16, i16* %a0
  %2 = bitcast i16 %1 to half
  %3 = fpext half %2 to float
  ret float %3
}

define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f32:
; ALL: # %bb.0:
; ALL-NEXT: movswl 6(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: movswl 4(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: movswl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: movswl 2(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: retq
  %1 = load <4 x i16>, <4 x i16>* %a0
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}

define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: vmovd %esi, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: vmovd %esi, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq (%rdi), %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: vmovd %esi, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i16> %2 to <4 x half>
  %4 = fpext <4 x half> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_8f32:
; ALL: # %bb.0:
; ALL-NEXT: movswl 6(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: movswl 4(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm1
; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
; ALL-NEXT: movswl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm2
; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
; ALL-NEXT: movswl 2(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm3
; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
; ALL-NEXT: movswl 14(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm4
; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
; ALL-NEXT: movswl 12(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm5
; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
; ALL-NEXT: movswl 8(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm6
; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
; ALL-NEXT: movswl 10(%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm7
; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; ALL-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %a0
  %2 = bitcast <8 x i16> %1 to <8 x half>
  %3 = fpext <8 x half> %2 to <8 x float>
  ret <8 x float> %3
}

define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: movswl 22(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX1-NEXT: movswl 20(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9
; AVX1-NEXT: movswl 16(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10
; AVX1-NEXT: movswl 18(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11
; AVX1-NEXT: movswl 30(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12
; AVX1-NEXT: movswl 28(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13
; AVX1-NEXT: movswl 24(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14
; AVX1-NEXT: movswl 26(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15
; AVX1-NEXT: movswl 6(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: movswl 4(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: movswl 2(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm4
; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT: movswl 14(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm5
; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT: movswl 12(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm6
; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT: movswl 8(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm7
; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT: movswl 10(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: movswl 22(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX2-NEXT: movswl 20(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9
; AVX2-NEXT: movswl 16(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10
; AVX2-NEXT: movswl 18(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11
; AVX2-NEXT: movswl 30(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12
; AVX2-NEXT: movswl 28(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13
; AVX2-NEXT: movswl 24(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14
; AVX2-NEXT: movswl 26(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15
; AVX2-NEXT: movswl 6(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: movswl 4(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: movswl 2(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm4
; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT: movswl 14(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm5
; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT: movswl 12(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm6
; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT: movswl 8(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm7
; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT: movswl 10(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_16i16_to_16f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movswl 6(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX512F-NEXT: movswl 4(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
; AVX512F-NEXT: movswl (%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
; AVX512F-NEXT: movswl 2(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
; AVX512F-NEXT: movswl 14(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
; AVX512F-NEXT: movswl 12(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
; AVX512F-NEXT: movswl 8(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
; AVX512F-NEXT: movswl 10(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
; AVX512F-NEXT: movswl 22(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl 20(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl 16(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: movswl 18(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: movswl 30(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm4
; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT: movswl 28(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm5
; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT: movswl 24(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm6
; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT: movswl 26(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm7
; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_16i16_to_16f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movswl 6(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX512VL-NEXT: movswl 4(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm9
; AVX512VL-NEXT: movswl (%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm10
; AVX512VL-NEXT: movswl 2(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm11
; AVX512VL-NEXT: movswl 14(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm4
; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm12
; AVX512VL-NEXT: movswl 12(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm5
; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm13
; AVX512VL-NEXT: movswl 8(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm6
; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm14
; AVX512VL-NEXT: movswl 10(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm7
; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm15
; AVX512VL-NEXT: movswl 22(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: movswl 20(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: movswl 16(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: movswl 18(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: movswl 30(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm4
; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512VL-NEXT: movswl 28(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm5
; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512VL-NEXT: movswl 24(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm6
; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX512VL-NEXT: movswl 26(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm7
; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %a0
  %2 = bitcast <16 x i16> %1 to <16 x half>
  %3 = fpext <16 x half> %2 to <16 x float>
  ret <16 x float> %3
}

;
; Half to Double
;

define double @cvt_i16_to_f64(i16 %a0) nounwind {
; ALL-LABEL: cvt_i16_to_f64:
; ALL: # %bb.0:
; ALL-NEXT: movswl %di, %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to double
  ret double %2
}

define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_2i16_to_2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: movswl %ax, %ecx
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: cvt_2i16_to_2f64:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
; AVX2-SLOW-NEXT: movswl %ax, %ecx
; AVX2-SLOW-NEXT: shrl $16, %eax
; AVX2-SLOW-NEXT: cwtl
; AVX2-SLOW-NEXT: vmovd %eax, %xmm0
; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1
; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: cvt_2i16_to_2f64:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: movswl %ax, %ecx
; AVX2-FAST-NEXT: shrl $16, %eax
; AVX2-FAST-NEXT: cwtl
; AVX2-FAST-NEXT: vmovd %eax, %xmm0
; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %ecx, %xmm1
; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: cvt_2i16_to_2f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; AVX512VL-NEXT: movswl %ax, %ecx
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
  %1 = bitcast <2 x i16> %a0 to <2 x half>
  %2 = fpext <2 x half> %1 to <2 x double>
  ret <2 x double> %2
}

define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: shrq $48, %rax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4i16_to_4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movl %eax, %edx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: shrq $48, %rax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovd %esi, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4i16_to_4f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movl %eax, %edx
; AVX512F-NEXT: movswl %ax, %esi
; AVX512F-NEXT: shrq $48, %rax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movl %eax, %edx
; AVX512VL-NEXT: movswl %ax, %esi
; AVX512VL-NEXT: shrq $48, %rax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrl $16, %edx
; AVX512VL-NEXT: movswl %dx, %edx
; AVX512VL-NEXT: vmovd %edx, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %esi, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm2
; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm3
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x double>
  ret <4 x double> %2
}

define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: movswl %ax, %ecx
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vmovd %ecx, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: movswl %ax, %ecx
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_8i16_to_2f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; AVX512VL-NEXT: movswl %ax, %ecx
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %ecx, %xmm1
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i16> %1 to <2 x half>
  %3 = fpext <2 x half> %2 to <2 x double>
  ret <2 x double> %3
}

define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: movswl %ax, %esi
; AVX1-NEXT: shrq $48, %rax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrl $16, %edx
; AVX1-NEXT: movswl %dx, %edx
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT: vmovd %esi, %xmm1
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm3
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movl %eax, %edx
; AVX2-NEXT: movswl %ax, %esi
; AVX2-NEXT: shrq $48, %rax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrl $16, %edx
; AVX2-NEXT: movswl %dx, %edx
; AVX2-NEXT: vmovd %edx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovd %esi, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm2
; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm3
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
%xmm3, %xmm3, %xmm3 1507; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1508; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1509; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1510; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1511; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1512; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1513; AVX2-NEXT: retq 1514; 1515; AVX512F-LABEL: cvt_8i16_to_4f64: 1516; AVX512F: # %bb.0: 1517; AVX512F-NEXT: vmovq %xmm0, %rax 1518; AVX512F-NEXT: movq %rax, %rcx 1519; AVX512F-NEXT: movl %eax, %edx 1520; AVX512F-NEXT: movswl %ax, %esi 1521; AVX512F-NEXT: shrq $48, %rax 1522; AVX512F-NEXT: shrq $32, %rcx 1523; AVX512F-NEXT: shrl $16, %edx 1524; AVX512F-NEXT: movswl %dx, %edx 1525; AVX512F-NEXT: vmovd %edx, %xmm0 1526; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1527; AVX512F-NEXT: vmovd %esi, %xmm1 1528; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1529; AVX512F-NEXT: movswl %cx, %ecx 1530; AVX512F-NEXT: vmovd %ecx, %xmm2 1531; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 1532; AVX512F-NEXT: cwtl 1533; AVX512F-NEXT: vmovd %eax, %xmm3 1534; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 1535; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1536; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1537; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1538; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1539; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1540; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1541; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1542; AVX512F-NEXT: retq 1543; 1544; AVX512VL-LABEL: cvt_8i16_to_4f64: 1545; AVX512VL: # %bb.0: 1546; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1547; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp) 1548; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1549; AVX512VL-NEXT: movq %rax, %rcx 1550; AVX512VL-NEXT: movl %eax, %edx 1551; AVX512VL-NEXT: movswl %ax, %esi 1552; AVX512VL-NEXT: shrq $48, %rax 1553; AVX512VL-NEXT: shrq $32, %rcx 1554; AVX512VL-NEXT: shrl $16, %edx 1555; AVX512VL-NEXT: movswl %dx, %edx 1556; AVX512VL-NEXT: vmovd %edx, %xmm0 1557; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1558; AVX512VL-NEXT: vmovd %esi, %xmm1 1559; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1560; AVX512VL-NEXT: movswl %cx, %ecx 1561; AVX512VL-NEXT: vmovd %ecx, %xmm2 1562; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 1563; AVX512VL-NEXT: cwtl 1564; AVX512VL-NEXT: vmovd %eax, %xmm3 1565; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 1566; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1567; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1568; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1569; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1570; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1571; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1572; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1573; AVX512VL-NEXT: retq 1574 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1575 %2 = bitcast <4 x i16> %1 to <4 x half> 1576 %3 = fpext <4 x half> %2 to <4 x double> 1577 ret <4 x double> %3 1578} 1579 1580define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { 1581; AVX1-LABEL: cvt_8i16_to_8f64: 1582; AVX1: # %bb.0: 1583; AVX1-NEXT: vmovq %xmm0, %rdx 1584; AVX1-NEXT: movq %rdx, %r9 1585; AVX1-NEXT: movl %edx, %r10d 1586; AVX1-NEXT: movswl %dx, %r8d 1587; AVX1-NEXT: shrq $48, %rdx 1588; AVX1-NEXT: shrq $32, %r9 1589; AVX1-NEXT: shrl $16, %r10d 1590; AVX1-NEXT: vpextrq $1, %xmm0, %rdi 1591; AVX1-NEXT: movq %rdi, %rsi 1592; AVX1-NEXT: movl %edi, %eax 1593; AVX1-NEXT: movswl %di, 
%ecx 1594; AVX1-NEXT: shrq $48, %rdi 1595; AVX1-NEXT: shrq $32, %rsi 1596; AVX1-NEXT: shrl $16, %eax 1597; AVX1-NEXT: cwtl 1598; AVX1-NEXT: vmovd %eax, %xmm0 1599; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1600; AVX1-NEXT: vmovd %ecx, %xmm0 1601; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1602; AVX1-NEXT: movswl %si, %eax 1603; AVX1-NEXT: vmovd %eax, %xmm0 1604; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1605; AVX1-NEXT: movswl %di, %eax 1606; AVX1-NEXT: vmovd %eax, %xmm0 1607; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1608; AVX1-NEXT: movswl %r10w, %eax 1609; AVX1-NEXT: vmovd %eax, %xmm0 1610; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1611; AVX1-NEXT: vmovd %r8d, %xmm5 1612; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 1613; AVX1-NEXT: movswl %r9w, %eax 1614; AVX1-NEXT: vmovd %eax, %xmm6 1615; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1616; AVX1-NEXT: movswl %dx, %eax 1617; AVX1-NEXT: vmovd %eax, %xmm7 1618; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1619; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1620; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1621; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1622; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1623; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1624; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1625; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1626; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1627; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1628; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1629; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1630; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1631; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1632; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1633; AVX1-NEXT: retq 1634; 1635; AVX2-LABEL: cvt_8i16_to_8f64: 1636; AVX2: # %bb.0: 1637; AVX2-NEXT: vmovq %xmm0, %rdx 1638; AVX2-NEXT: movq %rdx, %r9 1639; AVX2-NEXT: movl %edx, %r10d 1640; AVX2-NEXT: movswl %dx, %r8d 1641; AVX2-NEXT: shrq $48, %rdx 1642; AVX2-NEXT: shrq $32, %r9 1643; AVX2-NEXT: shrl $16, %r10d 1644; AVX2-NEXT: vpextrq $1, %xmm0, %rdi 1645; AVX2-NEXT: movq %rdi, %rsi 1646; AVX2-NEXT: movl %edi, %eax 1647; AVX2-NEXT: movswl %di, %ecx 1648; AVX2-NEXT: shrq $48, %rdi 1649; AVX2-NEXT: shrq $32, %rsi 1650; AVX2-NEXT: shrl $16, %eax 1651; AVX2-NEXT: cwtl 1652; AVX2-NEXT: vmovd %eax, %xmm0 1653; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1654; AVX2-NEXT: vmovd %ecx, %xmm0 1655; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1656; AVX2-NEXT: movswl %si, %eax 1657; AVX2-NEXT: vmovd %eax, %xmm0 1658; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1659; AVX2-NEXT: movswl %di, %eax 1660; AVX2-NEXT: vmovd %eax, %xmm0 1661; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1662; AVX2-NEXT: movswl %r10w, %eax 1663; AVX2-NEXT: vmovd %eax, %xmm0 1664; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1665; AVX2-NEXT: vmovd %r8d, %xmm5 1666; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 1667; AVX2-NEXT: movswl %r9w, %eax 1668; AVX2-NEXT: vmovd %eax, %xmm6 1669; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 1670; AVX2-NEXT: movswl %dx, %eax 1671; AVX2-NEXT: vmovd %eax, %xmm7 1672; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 1673; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1674; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1675; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1676; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1677; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1678; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1679; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1680; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1681; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1682; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1683; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1684; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, 
%xmm1 1685; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1686; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1687; AVX2-NEXT: retq 1688; 1689; AVX512-LABEL: cvt_8i16_to_8f64: 1690; AVX512: # %bb.0: 1691; AVX512-NEXT: vpextrq $1, %xmm0, %rdx 1692; AVX512-NEXT: movq %rdx, %r9 1693; AVX512-NEXT: movl %edx, %r10d 1694; AVX512-NEXT: movswl %dx, %r8d 1695; AVX512-NEXT: shrq $48, %rdx 1696; AVX512-NEXT: shrq $32, %r9 1697; AVX512-NEXT: shrl $16, %r10d 1698; AVX512-NEXT: vmovq %xmm0, %rdi 1699; AVX512-NEXT: movq %rdi, %rsi 1700; AVX512-NEXT: movl %edi, %eax 1701; AVX512-NEXT: movswl %di, %ecx 1702; AVX512-NEXT: shrq $48, %rdi 1703; AVX512-NEXT: shrq $32, %rsi 1704; AVX512-NEXT: shrl $16, %eax 1705; AVX512-NEXT: cwtl 1706; AVX512-NEXT: vmovd %eax, %xmm0 1707; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1708; AVX512-NEXT: vmovd %ecx, %xmm1 1709; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 1710; AVX512-NEXT: movswl %si, %eax 1711; AVX512-NEXT: vmovd %eax, %xmm2 1712; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1713; AVX512-NEXT: movswl %di, %eax 1714; AVX512-NEXT: vmovd %eax, %xmm3 1715; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1716; AVX512-NEXT: movswl %r10w, %eax 1717; AVX512-NEXT: vmovd %eax, %xmm4 1718; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 1719; AVX512-NEXT: vmovd %r8d, %xmm5 1720; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 1721; AVX512-NEXT: movswl %r9w, %eax 1722; AVX512-NEXT: vmovd %eax, %xmm6 1723; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 1724; AVX512-NEXT: movswl %dx, %eax 1725; AVX512-NEXT: vmovd %eax, %xmm7 1726; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 1727; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1728; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1729; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1730; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1731; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1732; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] 1733; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 1734; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1735; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1736; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1737; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1738; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1739; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1740; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1741; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 1742; AVX512-NEXT: retq 1743 %1 = bitcast <8 x i16> %a0 to <8 x half> 1744 %2 = fpext <8 x half> %1 to <8 x double> 1745 ret <8 x double> %2 1746} 1747 1748; 1749; Half to Double (Load) 1750; 1751 1752define double @load_cvt_i16_to_f64(i16* %a0) nounwind { 1753; ALL-LABEL: load_cvt_i16_to_f64: 1754; ALL: # %bb.0: 1755; ALL-NEXT: movswl (%rdi), %eax 1756; ALL-NEXT: vmovd %eax, %xmm0 1757; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1758; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1759; ALL-NEXT: retq 1760 %1 = load i16, i16* %a0 1761 %2 = bitcast i16 %1 to half 1762 %3 = fpext half %2 to double 1763 ret double %3 1764} 1765 1766define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind { 1767; ALL-LABEL: load_cvt_2i16_to_2f64: 1768; ALL: # %bb.0: 1769; ALL-NEXT: movswl (%rdi), %eax 1770; ALL-NEXT: vmovd %eax, %xmm0 1771; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1772; ALL-NEXT: movswl 2(%rdi), %eax 1773; ALL-NEXT: vmovd %eax, %xmm1 1774; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1775; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1776; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1777; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1778; ALL-NEXT: retq 1779 %1 = load <2 x i16>, <2 x i16>* %a0 1780 %2 = bitcast <2 x i16> %1 to 
<2 x half> 1781 %3 = fpext <2 x half> %2 to <2 x double> 1782 ret <2 x double> %3 1783} 1784 1785define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind { 1786; ALL-LABEL: load_cvt_4i16_to_4f64: 1787; ALL: # %bb.0: 1788; ALL-NEXT: movswl (%rdi), %eax 1789; ALL-NEXT: vmovd %eax, %xmm0 1790; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1791; ALL-NEXT: movswl 2(%rdi), %eax 1792; ALL-NEXT: vmovd %eax, %xmm1 1793; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1794; ALL-NEXT: movswl 4(%rdi), %eax 1795; ALL-NEXT: vmovd %eax, %xmm2 1796; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1797; ALL-NEXT: movswl 6(%rdi), %eax 1798; ALL-NEXT: vmovd %eax, %xmm3 1799; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1800; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1801; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1802; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1803; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1804; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1805; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1806; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1807; ALL-NEXT: retq 1808 %1 = load <4 x i16>, <4 x i16>* %a0 1809 %2 = bitcast <4 x i16> %1 to <4 x half> 1810 %3 = fpext <4 x half> %2 to <4 x double> 1811 ret <4 x double> %3 1812} 1813 1814define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind { 1815; AVX1-LABEL: load_cvt_8i16_to_4f64: 1816; AVX1: # %bb.0: 1817; AVX1-NEXT: movq (%rdi), %rax 1818; AVX1-NEXT: movq %rax, %rcx 1819; AVX1-NEXT: movl %eax, %edx 1820; AVX1-NEXT: movswl %ax, %esi 1821; AVX1-NEXT: shrq $48, %rax 1822; AVX1-NEXT: shrq $32, %rcx 1823; AVX1-NEXT: shrl $16, %edx 1824; AVX1-NEXT: movswl %dx, %edx 1825; AVX1-NEXT: vmovd %edx, %xmm0 1826; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1827; AVX1-NEXT: vmovd %esi, %xmm1 1828; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 1829; AVX1-NEXT: movswl %cx, %ecx 1830; AVX1-NEXT: vmovd %ecx, %xmm2 1831; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 1832; AVX1-NEXT: cwtl 1833; AVX1-NEXT: vmovd %eax, %xmm3 1834; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 1835; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1836; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1837; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1838; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1839; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1840; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1841; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1842; AVX1-NEXT: retq 1843; 1844; AVX2-LABEL: load_cvt_8i16_to_4f64: 1845; AVX2: # %bb.0: 1846; AVX2-NEXT: movq (%rdi), %rax 1847; AVX2-NEXT: movq %rax, %rcx 1848; AVX2-NEXT: movl %eax, %edx 1849; AVX2-NEXT: movswl %ax, %esi 1850; AVX2-NEXT: shrq $48, %rax 1851; AVX2-NEXT: shrq $32, %rcx 1852; AVX2-NEXT: shrl $16, %edx 1853; AVX2-NEXT: movswl %dx, %edx 1854; AVX2-NEXT: vmovd %edx, %xmm0 1855; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1856; AVX2-NEXT: vmovd %esi, %xmm1 1857; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 1858; AVX2-NEXT: movswl %cx, %ecx 1859; AVX2-NEXT: vmovd %ecx, %xmm2 1860; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 1861; AVX2-NEXT: cwtl 1862; AVX2-NEXT: vmovd %eax, %xmm3 1863; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 1864; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1865; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1866; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1867; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1868; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1869; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1870; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1871; AVX2-NEXT: retq 1872; 1873; AVX512F-LABEL: load_cvt_8i16_to_4f64: 1874; AVX512F: # %bb.0: 1875; AVX512F-NEXT: movq (%rdi), %rax 1876; AVX512F-NEXT: movq %rax, 
%rcx 1877; AVX512F-NEXT: movl %eax, %edx 1878; AVX512F-NEXT: movswl %ax, %esi 1879; AVX512F-NEXT: shrq $48, %rax 1880; AVX512F-NEXT: shrq $32, %rcx 1881; AVX512F-NEXT: shrl $16, %edx 1882; AVX512F-NEXT: movswl %dx, %edx 1883; AVX512F-NEXT: vmovd %edx, %xmm0 1884; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 1885; AVX512F-NEXT: vmovd %esi, %xmm1 1886; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 1887; AVX512F-NEXT: movswl %cx, %ecx 1888; AVX512F-NEXT: vmovd %ecx, %xmm2 1889; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 1890; AVX512F-NEXT: cwtl 1891; AVX512F-NEXT: vmovd %eax, %xmm3 1892; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 1893; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1894; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1895; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1896; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1897; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1898; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1899; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1900; AVX512F-NEXT: retq 1901; 1902; AVX512VL-LABEL: load_cvt_8i16_to_4f64: 1903; AVX512VL: # %bb.0: 1904; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1905; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp) 1906; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1907; AVX512VL-NEXT: movq %rax, %rcx 1908; AVX512VL-NEXT: movl %eax, %edx 1909; AVX512VL-NEXT: movswl %ax, %esi 1910; AVX512VL-NEXT: shrq $48, %rax 1911; AVX512VL-NEXT: shrq $32, %rcx 1912; AVX512VL-NEXT: shrl $16, %edx 1913; AVX512VL-NEXT: movswl %dx, %edx 1914; AVX512VL-NEXT: vmovd %edx, %xmm0 1915; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 1916; AVX512VL-NEXT: vmovd %esi, %xmm1 1917; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 1918; AVX512VL-NEXT: movswl %cx, %ecx 1919; AVX512VL-NEXT: vmovd %ecx, %xmm2 1920; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 1921; AVX512VL-NEXT: cwtl 1922; AVX512VL-NEXT: vmovd %eax, %xmm3 1923; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 1924; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1925; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1926; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1927; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1928; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1929; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1930; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1931; AVX512VL-NEXT: retq 1932 %1 = load <8 x i16>, <8 x i16>* %a0 1933 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1934 %3 = bitcast <4 x i16> %2 to <4 x half> 1935 %4 = fpext <4 x half> %3 to <4 x double> 1936 ret <4 x double> %4 1937} 1938 1939define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind { 1940; AVX1-LABEL: load_cvt_8i16_to_8f64: 1941; AVX1: # %bb.0: 1942; AVX1-NEXT: movswl 8(%rdi), %eax 1943; AVX1-NEXT: vmovd %eax, %xmm0 1944; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1945; AVX1-NEXT: movswl 10(%rdi), %eax 1946; AVX1-NEXT: vmovd %eax, %xmm0 1947; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1948; AVX1-NEXT: movswl 12(%rdi), %eax 1949; AVX1-NEXT: vmovd %eax, %xmm0 1950; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1951; AVX1-NEXT: movswl 14(%rdi), %eax 1952; AVX1-NEXT: vmovd %eax, %xmm0 1953; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1954; AVX1-NEXT: movswl (%rdi), %eax 1955; AVX1-NEXT: vmovd %eax, %xmm0 1956; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1957; AVX1-NEXT: movswl 2(%rdi), %eax 1958; AVX1-NEXT: vmovd %eax, %xmm5 1959; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 1960; AVX1-NEXT: movswl 4(%rdi), %eax 1961; AVX1-NEXT: vmovd %eax, %xmm6 1962; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1963; 
AVX1-NEXT: movswl 6(%rdi), %eax 1964; AVX1-NEXT: vmovd %eax, %xmm7 1965; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1966; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1967; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1968; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1969; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1970; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1971; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1972; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1973; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1974; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1975; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1976; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1977; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1978; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1979; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1980; AVX1-NEXT: retq 1981; 1982; AVX2-LABEL: load_cvt_8i16_to_8f64: 1983; AVX2: # %bb.0: 1984; AVX2-NEXT: movswl 8(%rdi), %eax 1985; AVX2-NEXT: vmovd %eax, %xmm0 1986; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1987; AVX2-NEXT: movswl 10(%rdi), %eax 1988; AVX2-NEXT: vmovd %eax, %xmm0 1989; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1990; AVX2-NEXT: movswl 12(%rdi), %eax 1991; AVX2-NEXT: vmovd %eax, %xmm0 1992; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1993; AVX2-NEXT: movswl 14(%rdi), %eax 1994; AVX2-NEXT: vmovd %eax, %xmm0 1995; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1996; AVX2-NEXT: movswl (%rdi), %eax 1997; AVX2-NEXT: vmovd %eax, %xmm0 1998; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1999; AVX2-NEXT: movswl 2(%rdi), %eax 2000; AVX2-NEXT: vmovd %eax, %xmm5 2001; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 2002; AVX2-NEXT: movswl 4(%rdi), %eax 2003; AVX2-NEXT: vmovd %eax, %xmm6 2004; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 2005; AVX2-NEXT: movswl 6(%rdi), %eax 2006; AVX2-NEXT: vmovd %eax, %xmm7 2007; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 2008; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 2009; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 2010; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 2011; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 2012; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2013; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] 2014; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 2015; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 2016; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 2017; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] 2018; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 2019; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 2020; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2021; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 2022; AVX2-NEXT: retq 2023; 2024; AVX512-LABEL: load_cvt_8i16_to_8f64: 2025; AVX512: # %bb.0: 2026; AVX512-NEXT: movswl (%rdi), %eax 2027; AVX512-NEXT: vmovd %eax, %xmm0 2028; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 2029; AVX512-NEXT: movswl 2(%rdi), %eax 2030; AVX512-NEXT: vmovd %eax, %xmm1 2031; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 2032; AVX512-NEXT: movswl 4(%rdi), %eax 2033; AVX512-NEXT: vmovd %eax, %xmm2 2034; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 2035; AVX512-NEXT: movswl 6(%rdi), %eax 2036; AVX512-NEXT: vmovd %eax, %xmm3 2037; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 2038; AVX512-NEXT: movswl 8(%rdi), %eax 2039; AVX512-NEXT: vmovd %eax, %xmm4 2040; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 2041; AVX512-NEXT: movswl 10(%rdi), %eax 2042; AVX512-NEXT: vmovd %eax, %xmm5 2043; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 2044; AVX512-NEXT: movswl 12(%rdi), %eax 2045; AVX512-NEXT: vmovd %eax, %xmm6 2046; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 2047; AVX512-NEXT: movswl 14(%rdi), %eax 2048; AVX512-NEXT: vmovd %eax, %xmm7 2049; 
AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 2050; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 2051; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 2052; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] 2053; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 2054; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 2055; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] 2056; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 2057; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 2058; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 2059; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 2060; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 2061; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 2062; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2063; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2064; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 2065; AVX512-NEXT: retq 2066 %1 = load <8 x i16>, <8 x i16>* %a0 2067 %2 = bitcast <8 x i16> %1 to <8 x half> 2068 %3 = fpext <8 x half> %2 to <8 x double> 2069 ret <8 x double> %3 2070} 2071 2072; 2073; Float to Half 2074; 2075 2076define i16 @cvt_f32_to_i16(float %a0) nounwind { 2077; ALL-LABEL: cvt_f32_to_i16: 2078; ALL: # %bb.0: 2079; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2080; ALL-NEXT: vmovd %xmm0, %eax 2081; ALL-NEXT: # kill: def $ax killed $ax killed $eax 2082; ALL-NEXT: retq 2083 %1 = fptrunc float %a0 to half 2084 %2 = bitcast half %1 to i16 2085 ret i16 %2 2086} 2087 2088define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind { 2089; ALL-LABEL: cvt_4f32_to_4i16: 2090; ALL: # %bb.0: 2091; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2092; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2093; ALL-NEXT: vmovd %xmm1, %eax 2094; ALL-NEXT: shll $16, %eax 2095; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2096; ALL-NEXT: vmovd %xmm1, %ecx 2097; ALL-NEXT: movzwl %cx, %ecx 2098; ALL-NEXT: orl %eax, %ecx 2099; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2100; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2101; ALL-NEXT: vmovd %xmm1, %eax 2102; ALL-NEXT: shll $16, %eax 2103; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2104; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2105; ALL-NEXT: vmovd %xmm0, %edx 2106; ALL-NEXT: movzwl %dx, %edx 2107; ALL-NEXT: orl %eax, %edx 2108; ALL-NEXT: shlq $32, %rdx 2109; ALL-NEXT: orq %rcx, %rdx 2110; ALL-NEXT: vmovq %rdx, %xmm0 2111; ALL-NEXT: retq 2112 %1 = fptrunc <4 x float> %a0 to <4 x half> 2113 %2 = bitcast <4 x half> %1 to <4 x i16> 2114 ret <4 x i16> %2 2115} 2116 2117define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { 2118; ALL-LABEL: cvt_4f32_to_8i16_undef: 2119; ALL: # %bb.0: 2120; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2121; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2122; ALL-NEXT: vmovd %xmm1, %eax 2123; ALL-NEXT: shll $16, %eax 2124; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2125; ALL-NEXT: vmovd %xmm1, %ecx 2126; ALL-NEXT: movzwl %cx, %ecx 2127; ALL-NEXT: orl %eax, %ecx 2128; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2129; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2130; ALL-NEXT: vmovd %xmm1, %eax 2131; ALL-NEXT: shll $16, %eax 2132; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2133; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2134; ALL-NEXT: vmovd %xmm0, %edx 2135; ALL-NEXT: movzwl %dx, %edx 2136; ALL-NEXT: orl %eax, %edx 2137; ALL-NEXT: shlq $32, %rdx 2138; ALL-NEXT: orq %rcx, %rdx 2139; ALL-NEXT: vmovq %rdx, %xmm0 2140; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2141; ALL-NEXT: retq 2142 %1 = fptrunc <4 x float> %a0 to <4 x half> 2143 %2 = bitcast <4 x half> %1 to <4 x i16> 2144 %3 = shufflevector <4 x i16> %2, <4 
x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2145 ret <8 x i16> %3 2146} 2147 2148define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { 2149; AVX1-LABEL: cvt_4f32_to_8i16_zero: 2150; AVX1: # %bb.0: 2151; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2152; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2153; AVX1-NEXT: vmovd %xmm1, %eax 2154; AVX1-NEXT: shll $16, %eax 2155; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2156; AVX1-NEXT: vmovd %xmm1, %ecx 2157; AVX1-NEXT: movzwl %cx, %ecx 2158; AVX1-NEXT: orl %eax, %ecx 2159; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2160; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2161; AVX1-NEXT: vmovd %xmm1, %eax 2162; AVX1-NEXT: shll $16, %eax 2163; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2164; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2165; AVX1-NEXT: vmovd %xmm0, %edx 2166; AVX1-NEXT: movzwl %dx, %edx 2167; AVX1-NEXT: orl %eax, %edx 2168; AVX1-NEXT: shlq $32, %rdx 2169; AVX1-NEXT: orq %rcx, %rdx 2170; AVX1-NEXT: vmovq %rdx, %xmm0 2171; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2172; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2173; AVX1-NEXT: retq 2174; 2175; AVX2-SLOW-LABEL: cvt_4f32_to_8i16_zero: 2176; AVX2-SLOW: # %bb.0: 2177; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2178; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2179; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2180; AVX2-SLOW-NEXT: shll $16, %eax 2181; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2182; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx 2183; AVX2-SLOW-NEXT: movzwl %cx, %ecx 2184; AVX2-SLOW-NEXT: orl %eax, %ecx 2185; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2186; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2187; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2188; AVX2-SLOW-NEXT: shll $16, %eax 2189; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2190; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2191; AVX2-SLOW-NEXT: vmovd %xmm0, %edx 2192; AVX2-SLOW-NEXT: movzwl %dx, %edx 2193; AVX2-SLOW-NEXT: orl %eax, %edx 2194; AVX2-SLOW-NEXT: shlq $32, %rdx 2195; AVX2-SLOW-NEXT: orq %rcx, %rdx 2196; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0 2197; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2198; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2199; AVX2-SLOW-NEXT: retq 2200; 2201; AVX2-FAST-LABEL: cvt_4f32_to_8i16_zero: 2202; AVX2-FAST: # %bb.0: 2203; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2204; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2205; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2206; AVX2-FAST-NEXT: shll $16, %eax 2207; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2208; AVX2-FAST-NEXT: vmovd %xmm1, %ecx 2209; AVX2-FAST-NEXT: movzwl %cx, %ecx 2210; AVX2-FAST-NEXT: orl %eax, %ecx 2211; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2212; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2213; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2214; AVX2-FAST-NEXT: shll $16, %eax 2215; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2216; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2217; AVX2-FAST-NEXT: vmovd %xmm0, %edx 2218; AVX2-FAST-NEXT: movzwl %dx, %edx 2219; AVX2-FAST-NEXT: orl %eax, %edx 2220; AVX2-FAST-NEXT: shlq $32, %rdx 2221; AVX2-FAST-NEXT: orq %rcx, %rdx 2222; AVX2-FAST-NEXT: vmovq %rdx, %xmm0 2223; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2224; AVX2-FAST-NEXT: retq 2225; 2226; AVX512F-LABEL: cvt_4f32_to_8i16_zero: 2227; AVX512F: # %bb.0: 2228; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2229; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2230; 
AVX512F-NEXT: vmovd %xmm1, %eax 2231; AVX512F-NEXT: shll $16, %eax 2232; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2233; AVX512F-NEXT: vmovd %xmm1, %ecx 2234; AVX512F-NEXT: movzwl %cx, %ecx 2235; AVX512F-NEXT: orl %eax, %ecx 2236; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2237; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2238; AVX512F-NEXT: vmovd %xmm1, %eax 2239; AVX512F-NEXT: shll $16, %eax 2240; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2241; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2242; AVX512F-NEXT: vmovd %xmm0, %edx 2243; AVX512F-NEXT: movzwl %dx, %edx 2244; AVX512F-NEXT: orl %eax, %edx 2245; AVX512F-NEXT: shlq $32, %rdx 2246; AVX512F-NEXT: orq %rcx, %rdx 2247; AVX512F-NEXT: vmovq %rdx, %xmm0 2248; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2249; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2250; AVX512F-NEXT: retq 2251; 2252; AVX512VL-LABEL: cvt_4f32_to_8i16_zero: 2253; AVX512VL: # %bb.0: 2254; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2255; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2256; AVX512VL-NEXT: vmovd %xmm1, %eax 2257; AVX512VL-NEXT: shll $16, %eax 2258; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2259; AVX512VL-NEXT: vmovd %xmm1, %ecx 2260; AVX512VL-NEXT: movzwl %cx, %ecx 2261; AVX512VL-NEXT: orl %eax, %ecx 2262; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2263; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2264; AVX512VL-NEXT: vmovd %xmm1, %eax 2265; AVX512VL-NEXT: shll $16, %eax 2266; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2267; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2268; AVX512VL-NEXT: vmovd %xmm0, %edx 2269; AVX512VL-NEXT: movzwl %dx, %edx 2270; AVX512VL-NEXT: orl %eax, %edx 2271; AVX512VL-NEXT: shlq $32, %rdx 2272; AVX512VL-NEXT: orq %rcx, %rdx 2273; AVX512VL-NEXT: vmovq %rdx, %xmm0 2274; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2275; AVX512VL-NEXT: retq 2276 %1 = fptrunc <4 x float> %a0 to <4 x half> 2277 %2 = bitcast <4 x half> %1 to <4 x i16> 2278 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2279 ret <8 x i16> %3 2280} 2281 2282define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind { 2283; ALL-LABEL: cvt_8f32_to_8i16: 2284; ALL: # %bb.0: 2285; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2286; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2287; ALL-NEXT: vmovd %xmm1, %eax 2288; ALL-NEXT: shll $16, %eax 2289; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2290; ALL-NEXT: vmovd %xmm1, %ecx 2291; ALL-NEXT: movzwl %cx, %ecx 2292; ALL-NEXT: orl %eax, %ecx 2293; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2294; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2295; ALL-NEXT: vmovd %xmm1, %edx 2296; ALL-NEXT: shll $16, %edx 2297; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2298; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2299; ALL-NEXT: vmovd %xmm1, %eax 2300; ALL-NEXT: movzwl %ax, %eax 2301; ALL-NEXT: orl %edx, %eax 2302; ALL-NEXT: shlq $32, %rax 2303; ALL-NEXT: orq %rcx, %rax 2304; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 2305; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2306; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2307; ALL-NEXT: vmovd %xmm1, %ecx 2308; ALL-NEXT: shll $16, %ecx 2309; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2310; ALL-NEXT: vmovd %xmm1, %edx 2311; ALL-NEXT: movzwl %dx, %edx 2312; ALL-NEXT: orl %ecx, %edx 2313; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2314; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2315; ALL-NEXT: vmovd %xmm1, %ecx 2316; ALL-NEXT: shll $16, 
%ecx 2317; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2318; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2319; ALL-NEXT: vmovd %xmm0, %esi 2320; ALL-NEXT: movzwl %si, %esi 2321; ALL-NEXT: orl %ecx, %esi 2322; ALL-NEXT: shlq $32, %rsi 2323; ALL-NEXT: orq %rdx, %rsi 2324; ALL-NEXT: vmovq %rsi, %xmm0 2325; ALL-NEXT: vmovq %rax, %xmm1 2326; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2327; ALL-NEXT: vzeroupper 2328; ALL-NEXT: retq 2329 %1 = fptrunc <8 x float> %a0 to <8 x half> 2330 %2 = bitcast <8 x half> %1 to <8 x i16> 2331 ret <8 x i16> %2 2332} 2333 2334define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { 2335; AVX1-LABEL: cvt_16f32_to_16i16: 2336; AVX1: # %bb.0: 2337; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2 2338; AVX1-NEXT: vmovd %xmm2, %eax 2339; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2340; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2341; AVX1-NEXT: vmovd %eax, %xmm3 2342; AVX1-NEXT: vmovd %xmm2, %eax 2343; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2344; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2345; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2346; AVX1-NEXT: vmovd %xmm2, %eax 2347; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2348; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 2349; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2350; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2351; AVX1-NEXT: vmovd %xmm1, %eax 2352; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1 2353; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2354; AVX1-NEXT: vmovd %xmm1, %eax 2355; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2356; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2357; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2358; AVX1-NEXT: vmovd %xmm1, %eax 2359; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 2360; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2361; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2362; AVX1-NEXT: vmovd %xmm1, %eax 2363; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2364; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 2365; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2366; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 2367; AVX1-NEXT: vmovd %xmm2, %eax 2368; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 2369; AVX1-NEXT: vmovd %xmm1, %eax 2370; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2371; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2372; AVX1-NEXT: vmovd %eax, %xmm3 2373; AVX1-NEXT: vmovd %xmm1, %eax 2374; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2375; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2376; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2377; AVX1-NEXT: vmovd %xmm1, %eax 2378; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2379; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2380; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2381; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2382; AVX1-NEXT: vmovd %xmm0, %eax 2383; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2384; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2385; AVX1-NEXT: vmovd %xmm0, %eax 2386; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2387; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2388; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2389; AVX1-NEXT: vmovd %xmm0, %eax 2390; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 2391; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2392; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2393; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2394; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2395; AVX1-NEXT: vmovd %xmm1, %eax 2396; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 2397; AVX1-NEXT: vmovd %xmm0, %eax 2398; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 2399; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2400; AVX1-NEXT: retq 2401; 2402; 
AVX2-LABEL: cvt_16f32_to_16i16: 2403; AVX2: # %bb.0: 2404; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2 2405; AVX2-NEXT: vmovd %xmm2, %eax 2406; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2407; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2408; AVX2-NEXT: vmovd %eax, %xmm3 2409; AVX2-NEXT: vmovd %xmm2, %eax 2410; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2411; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2412; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2413; AVX2-NEXT: vmovd %xmm2, %eax 2414; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 2415; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 2416; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2417; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2418; AVX2-NEXT: vmovd %xmm1, %eax 2419; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1 2420; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2421; AVX2-NEXT: vmovd %xmm1, %eax 2422; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2423; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2424; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2425; AVX2-NEXT: vmovd %xmm1, %eax 2426; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 2427; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2428; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2429; AVX2-NEXT: vmovd %xmm1, %eax 2430; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2431; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 2432; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2433; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 2434; AVX2-NEXT: vmovd %xmm2, %eax 2435; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 2436; AVX2-NEXT: vmovd %xmm1, %eax 2437; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2438; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2439; AVX2-NEXT: vmovd %eax, %xmm3 2440; AVX2-NEXT: vmovd %xmm1, %eax 2441; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2442; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2443; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2444; AVX2-NEXT: vmovd %xmm1, %eax 2445; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 2446; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2447; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2448; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2449; AVX2-NEXT: vmovd %xmm0, %eax 2450; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2451; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2452; AVX2-NEXT: vmovd %xmm0, %eax 2453; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2454; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2455; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2456; AVX2-NEXT: vmovd %xmm0, %eax 2457; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 2458; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2459; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2460; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2461; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2462; AVX2-NEXT: vmovd %xmm1, %eax 2463; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 2464; AVX2-NEXT: vmovd %xmm0, %eax 2465; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 2466; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2467; AVX2-NEXT: retq 2468; 2469; AVX512-LABEL: cvt_16f32_to_16i16: 2470; AVX512: # %bb.0: 2471; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 2472; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2 2473; AVX512-NEXT: vmovd %xmm2, %eax 2474; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2475; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2476; AVX512-NEXT: vmovd %eax, %xmm3 2477; AVX512-NEXT: vmovd %xmm2, %eax 2478; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2479; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2480; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2481; AVX512-NEXT: vmovd %xmm2, %eax 2482; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 2483; AVX512-NEXT: vpermilps {{.*#+}} 
xmm1 = xmm1[3,1,2,3] 2484; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2485; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2486; AVX512-NEXT: vmovd %xmm1, %eax 2487; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 2488; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2489; AVX512-NEXT: vmovd %xmm1, %eax 2490; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2491; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2492; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2493; AVX512-NEXT: vmovd %xmm1, %eax 2494; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 2495; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2496; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2497; AVX512-NEXT: vmovd %xmm1, %eax 2498; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2499; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 2500; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2501; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 2502; AVX512-NEXT: vmovd %xmm2, %eax 2503; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 2504; AVX512-NEXT: vmovd %xmm1, %eax 2505; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2506; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2507; AVX512-NEXT: vmovd %eax, %xmm3 2508; AVX512-NEXT: vmovd %xmm1, %eax 2509; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2510; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2511; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 2512; AVX512-NEXT: vmovd %xmm1, %eax 2513; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2514; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2515; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2516; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 2517; AVX512-NEXT: vmovd %xmm0, %eax 2518; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2519; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 2520; AVX512-NEXT: vmovd %xmm0, %eax 2521; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2522; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2523; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 2524; AVX512-NEXT: vmovd %xmm0, %eax 2525; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] 2526; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2527; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 2528; AVX512-NEXT: vmovd %xmm0, %eax 2529; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 2530; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2531; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 2532; AVX512-NEXT: vmovd %xmm0, %eax 2533; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 2534; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2535; AVX512-NEXT: retq 2536 %1 = fptrunc <16 x float> %a0 to <16 x half> 2537 %2 = bitcast <16 x half> %1 to <16 x i16> 2538 ret <16 x i16> %2 2539} 2540 2541; 2542; Float to Half (Store) 2543; 2544 2545define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind { 2546; ALL-LABEL: store_cvt_f32_to_i16: 2547; ALL: # %bb.0: 2548; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2549; ALL-NEXT: vmovd %xmm0, %eax 2550; ALL-NEXT: movw %ax, (%rdi) 2551; ALL-NEXT: retq 2552 %1 = fptrunc float %a0 to half 2553 %2 = bitcast half %1 to i16 2554 store i16 %2, i16* %a1 2555 ret void 2556} 2557 2558define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind { 2559; ALL-LABEL: store_cvt_4f32_to_4i16: 2560; ALL: # %bb.0: 2561; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2562; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2563; ALL-NEXT: vmovd %xmm1, %eax 2564; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2565; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2566; ALL-NEXT: vmovd %xmm1, %ecx 2567; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2568; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2569; ALL-NEXT: vmovd %xmm1, %edx 2570; ALL-NEXT: 
vcvtps2ph $4, %xmm0, %xmm0 2571; ALL-NEXT: vmovd %xmm0, %esi 2572; ALL-NEXT: movw %si, (%rdi) 2573; ALL-NEXT: movw %dx, 6(%rdi) 2574; ALL-NEXT: movw %cx, 4(%rdi) 2575; ALL-NEXT: movw %ax, 2(%rdi) 2576; ALL-NEXT: retq 2577 %1 = fptrunc <4 x float> %a0 to <4 x half> 2578 %2 = bitcast <4 x half> %1 to <4 x i16> 2579 store <4 x i16> %2, <4 x i16>* %a1 2580 ret void 2581} 2582 2583define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind { 2584; ALL-LABEL: store_cvt_4f32_to_8i16_undef: 2585; ALL: # %bb.0: 2586; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2587; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2588; ALL-NEXT: vmovd %xmm1, %eax 2589; ALL-NEXT: shll $16, %eax 2590; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2591; ALL-NEXT: vmovd %xmm1, %ecx 2592; ALL-NEXT: movzwl %cx, %ecx 2593; ALL-NEXT: orl %eax, %ecx 2594; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2595; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2596; ALL-NEXT: vmovd %xmm1, %eax 2597; ALL-NEXT: shll $16, %eax 2598; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2599; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2600; ALL-NEXT: vmovd %xmm0, %edx 2601; ALL-NEXT: movzwl %dx, %edx 2602; ALL-NEXT: orl %eax, %edx 2603; ALL-NEXT: shlq $32, %rdx 2604; ALL-NEXT: orq %rcx, %rdx 2605; ALL-NEXT: vmovq %rdx, %xmm0 2606; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2607; ALL-NEXT: vmovdqa %xmm0, (%rdi) 2608; ALL-NEXT: retq 2609 %1 = fptrunc <4 x float> %a0 to <4 x half> 2610 %2 = bitcast <4 x half> %1 to <4 x i16> 2611 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2612 store <8 x i16> %3, <8 x i16>* %a1 2613 ret void 2614} 2615 2616define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind { 2617; AVX1-LABEL: store_cvt_4f32_to_8i16_zero: 2618; AVX1: # %bb.0: 2619; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2620; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2621; AVX1-NEXT: vmovd %xmm1, %eax 2622; AVX1-NEXT: shll $16, %eax 2623; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2624; AVX1-NEXT: vmovd %xmm1, %ecx 2625; AVX1-NEXT: movzwl %cx, %ecx 2626; AVX1-NEXT: orl %eax, %ecx 2627; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2628; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2629; AVX1-NEXT: vmovd %xmm1, %eax 2630; AVX1-NEXT: shll $16, %eax 2631; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2632; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2633; AVX1-NEXT: vmovd %xmm0, %edx 2634; AVX1-NEXT: movzwl %dx, %edx 2635; AVX1-NEXT: orl %eax, %edx 2636; AVX1-NEXT: shlq $32, %rdx 2637; AVX1-NEXT: orq %rcx, %rdx 2638; AVX1-NEXT: vmovq %rdx, %xmm0 2639; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2640; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2641; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 2642; AVX1-NEXT: retq 2643; 2644; AVX2-SLOW-LABEL: store_cvt_4f32_to_8i16_zero: 2645; AVX2-SLOW: # %bb.0: 2646; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2647; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2648; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2649; AVX2-SLOW-NEXT: shll $16, %eax 2650; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2651; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx 2652; AVX2-SLOW-NEXT: movzwl %cx, %ecx 2653; AVX2-SLOW-NEXT: orl %eax, %ecx 2654; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2655; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2656; AVX2-SLOW-NEXT: vmovd %xmm1, %eax 2657; AVX2-SLOW-NEXT: shll $16, %eax 2658; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2659; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2660; 
AVX2-SLOW-NEXT: vmovd %xmm0, %edx 2661; AVX2-SLOW-NEXT: movzwl %dx, %edx 2662; AVX2-SLOW-NEXT: orl %eax, %edx 2663; AVX2-SLOW-NEXT: shlq $32, %rdx 2664; AVX2-SLOW-NEXT: orq %rcx, %rdx 2665; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0 2666; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2667; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2668; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi) 2669; AVX2-SLOW-NEXT: retq 2670; 2671; AVX2-FAST-LABEL: store_cvt_4f32_to_8i16_zero: 2672; AVX2-FAST: # %bb.0: 2673; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2674; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2675; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2676; AVX2-FAST-NEXT: shll $16, %eax 2677; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2678; AVX2-FAST-NEXT: vmovd %xmm1, %ecx 2679; AVX2-FAST-NEXT: movzwl %cx, %ecx 2680; AVX2-FAST-NEXT: orl %eax, %ecx 2681; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2682; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2683; AVX2-FAST-NEXT: vmovd %xmm1, %eax 2684; AVX2-FAST-NEXT: shll $16, %eax 2685; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2686; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2687; AVX2-FAST-NEXT: vmovd %xmm0, %edx 2688; AVX2-FAST-NEXT: movzwl %dx, %edx 2689; AVX2-FAST-NEXT: orl %eax, %edx 2690; AVX2-FAST-NEXT: shlq $32, %rdx 2691; AVX2-FAST-NEXT: orq %rcx, %rdx 2692; AVX2-FAST-NEXT: vmovq %rdx, %xmm0 2693; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2694; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rdi) 2695; AVX2-FAST-NEXT: retq 2696; 2697; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero: 2698; AVX512F: # %bb.0: 2699; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2700; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2701; AVX512F-NEXT: vmovd %xmm1, %eax 2702; AVX512F-NEXT: shll $16, %eax 2703; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2704; AVX512F-NEXT: vmovd %xmm1, %ecx 2705; AVX512F-NEXT: movzwl %cx, %ecx 2706; AVX512F-NEXT: orl %eax, %ecx 2707; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2708; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2709; AVX512F-NEXT: vmovd %xmm1, %eax 2710; AVX512F-NEXT: shll $16, %eax 2711; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2712; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2713; AVX512F-NEXT: vmovd %xmm0, %edx 2714; AVX512F-NEXT: movzwl %dx, %edx 2715; AVX512F-NEXT: orl %eax, %edx 2716; AVX512F-NEXT: shlq $32, %rdx 2717; AVX512F-NEXT: orq %rcx, %rdx 2718; AVX512F-NEXT: vmovq %rdx, %xmm0 2719; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2720; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 2721; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) 2722; AVX512F-NEXT: retq 2723; 2724; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero: 2725; AVX512VL: # %bb.0: 2726; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2727; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2728; AVX512VL-NEXT: vmovd %xmm1, %eax 2729; AVX512VL-NEXT: shll $16, %eax 2730; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 2731; AVX512VL-NEXT: vmovd %xmm1, %ecx 2732; AVX512VL-NEXT: movzwl %cx, %ecx 2733; AVX512VL-NEXT: orl %eax, %ecx 2734; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2735; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2736; AVX512VL-NEXT: vmovd %xmm1, %eax 2737; AVX512VL-NEXT: shll $16, %eax 2738; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2739; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2740; AVX512VL-NEXT: vmovd %xmm0, %edx 2741; AVX512VL-NEXT: movzwl %dx, %edx 2742; AVX512VL-NEXT: orl %eax, %edx 2743; AVX512VL-NEXT: shlq $32, %rdx 2744; AVX512VL-NEXT: 
orq %rcx, %rdx 2745; AVX512VL-NEXT: vmovq %rdx, %xmm0 2746; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 2747; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi) 2748; AVX512VL-NEXT: retq 2749 %1 = fptrunc <4 x float> %a0 to <4 x half> 2750 %2 = bitcast <4 x half> %1 to <4 x i16> 2751 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2752 store <8 x i16> %3, <8 x i16>* %a1 2753 ret void 2754} 2755 2756define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind { 2757; ALL-LABEL: store_cvt_8f32_to_8i16: 2758; ALL: # %bb.0: 2759; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2760; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2761; ALL-NEXT: vmovd %xmm1, %r8d 2762; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2763; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2764; ALL-NEXT: vmovd %xmm1, %r9d 2765; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2766; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2767; ALL-NEXT: vmovd %xmm1, %r10d 2768; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 2769; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2770; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2771; ALL-NEXT: vmovd %xmm2, %r11d 2772; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2773; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2774; ALL-NEXT: vmovd %xmm2, %eax 2775; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2776; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2777; ALL-NEXT: vmovd %xmm2, %ecx 2778; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2779; ALL-NEXT: vmovd %xmm0, %edx 2780; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2781; ALL-NEXT: vmovd %xmm0, %esi 2782; ALL-NEXT: movw %si, 8(%rdi) 2783; ALL-NEXT: movw %dx, (%rdi) 2784; ALL-NEXT: movw %cx, 14(%rdi) 2785; ALL-NEXT: movw %ax, 12(%rdi) 2786; ALL-NEXT: movw %r11w, 10(%rdi) 2787; ALL-NEXT: movw %r10w, 6(%rdi) 2788; ALL-NEXT: movw %r9w, 4(%rdi) 2789; ALL-NEXT: movw %r8w, 2(%rdi) 2790; ALL-NEXT: vzeroupper 2791; ALL-NEXT: retq 2792 %1 = fptrunc <8 x float> %a0 to <8 x half> 2793 %2 = bitcast <8 x half> %1 to <8 x i16> 2794 store <8 x i16> %2, <8 x i16>* %a1 2795 ret void 2796} 2797 2798define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind { 2799; AVX1-LABEL: store_cvt_16f32_to_16i16: 2800; AVX1: # %bb.0: 2801; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2802; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2803; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2804; AVX1-NEXT: vmovd %xmm4, %eax 2805; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2806; AVX1-NEXT: movw %ax, 24(%rdi) 2807; AVX1-NEXT: vmovd %xmm4, %eax 2808; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2809; AVX1-NEXT: movw %ax, 16(%rdi) 2810; AVX1-NEXT: vmovd %xmm4, %eax 2811; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2812; AVX1-NEXT: movw %ax, 8(%rdi) 2813; AVX1-NEXT: vmovd %xmm4, %eax 2814; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2815; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2816; AVX1-NEXT: movw %ax, (%rdi) 2817; AVX1-NEXT: vmovd %xmm4, %eax 2818; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2819; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2820; AVX1-NEXT: movw %ax, 30(%rdi) 2821; AVX1-NEXT: vmovd %xmm4, %eax 2822; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2823; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2824; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2825; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2826; AVX1-NEXT: movw %ax, 28(%rdi) 2827; AVX1-NEXT: vmovd %xmm3, %eax 2828; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2829; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2830; AVX1-NEXT: movw %ax, 
26(%rdi) 2831; AVX1-NEXT: vmovd %xmm3, %eax 2832; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2833; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2834; AVX1-NEXT: movw %ax, 22(%rdi) 2835; AVX1-NEXT: vmovd %xmm3, %eax 2836; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2837; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2838; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2839; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2840; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2841; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2842; AVX1-NEXT: movw %ax, 20(%rdi) 2843; AVX1-NEXT: vmovd %xmm1, %eax 2844; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2845; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2846; AVX1-NEXT: movw %ax, 18(%rdi) 2847; AVX1-NEXT: vmovd %xmm1, %eax 2848; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2849; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2850; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2851; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2852; AVX1-NEXT: movw %ax, 14(%rdi) 2853; AVX1-NEXT: vmovd %xmm2, %eax 2854; AVX1-NEXT: movw %ax, 12(%rdi) 2855; AVX1-NEXT: vmovd %xmm1, %eax 2856; AVX1-NEXT: movw %ax, 10(%rdi) 2857; AVX1-NEXT: vmovd %xmm0, %eax 2858; AVX1-NEXT: movw %ax, 6(%rdi) 2859; AVX1-NEXT: vmovd %xmm3, %eax 2860; AVX1-NEXT: movw %ax, 4(%rdi) 2861; AVX1-NEXT: vmovd %xmm4, %eax 2862; AVX1-NEXT: movw %ax, 2(%rdi) 2863; AVX1-NEXT: vzeroupper 2864; AVX1-NEXT: retq 2865; 2866; AVX2-LABEL: store_cvt_16f32_to_16i16: 2867; AVX2: # %bb.0: 2868; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 2869; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 2870; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2871; AVX2-NEXT: vmovd %xmm4, %eax 2872; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2873; AVX2-NEXT: movw %ax, 24(%rdi) 2874; AVX2-NEXT: vmovd %xmm4, %eax 2875; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2876; AVX2-NEXT: movw %ax, 16(%rdi) 2877; AVX2-NEXT: vmovd %xmm4, %eax 2878; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2879; AVX2-NEXT: movw %ax, 8(%rdi) 2880; AVX2-NEXT: vmovd %xmm4, %eax 2881; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2882; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2883; AVX2-NEXT: movw %ax, (%rdi) 2884; AVX2-NEXT: vmovd %xmm4, %eax 2885; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2886; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2887; AVX2-NEXT: movw %ax, 30(%rdi) 2888; AVX2-NEXT: vmovd %xmm4, %eax 2889; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2890; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2891; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2892; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2893; AVX2-NEXT: movw %ax, 28(%rdi) 2894; AVX2-NEXT: vmovd %xmm3, %eax 2895; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2896; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2897; AVX2-NEXT: movw %ax, 26(%rdi) 2898; AVX2-NEXT: vmovd %xmm3, %eax 2899; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2900; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2901; AVX2-NEXT: movw %ax, 22(%rdi) 2902; AVX2-NEXT: vmovd %xmm3, %eax 2903; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2904; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2905; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2906; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2907; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2908; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2909; AVX2-NEXT: movw %ax, 20(%rdi) 2910; AVX2-NEXT: vmovd %xmm1, %eax 2911; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2912; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2913; AVX2-NEXT: movw %ax, 18(%rdi) 2914; AVX2-NEXT: vmovd %xmm1, %eax 2915; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2916; AVX2-NEXT: vcvtps2ph 
$4, %xmm1, %xmm1 2917; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2918; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2919; AVX2-NEXT: movw %ax, 14(%rdi) 2920; AVX2-NEXT: vmovd %xmm2, %eax 2921; AVX2-NEXT: movw %ax, 12(%rdi) 2922; AVX2-NEXT: vmovd %xmm1, %eax 2923; AVX2-NEXT: movw %ax, 10(%rdi) 2924; AVX2-NEXT: vmovd %xmm0, %eax 2925; AVX2-NEXT: movw %ax, 6(%rdi) 2926; AVX2-NEXT: vmovd %xmm3, %eax 2927; AVX2-NEXT: movw %ax, 4(%rdi) 2928; AVX2-NEXT: vmovd %xmm4, %eax 2929; AVX2-NEXT: movw %ax, 2(%rdi) 2930; AVX2-NEXT: vzeroupper 2931; AVX2-NEXT: retq 2932; 2933; AVX512-LABEL: store_cvt_16f32_to_16i16: 2934; AVX512: # %bb.0: 2935; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2936; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2 2937; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3 2938; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2939; AVX512-NEXT: vmovd %xmm4, %eax 2940; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2941; AVX512-NEXT: movw %ax, 24(%rdi) 2942; AVX512-NEXT: vmovd %xmm4, %eax 2943; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2944; AVX512-NEXT: movw %ax, 16(%rdi) 2945; AVX512-NEXT: vmovd %xmm4, %eax 2946; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2947; AVX512-NEXT: movw %ax, 8(%rdi) 2948; AVX512-NEXT: vmovd %xmm4, %eax 2949; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2950; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2951; AVX512-NEXT: movw %ax, (%rdi) 2952; AVX512-NEXT: vmovd %xmm4, %eax 2953; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2954; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2955; AVX512-NEXT: movw %ax, 30(%rdi) 2956; AVX512-NEXT: vmovd %xmm4, %eax 2957; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2958; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2959; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2960; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2961; AVX512-NEXT: movw %ax, 28(%rdi) 2962; AVX512-NEXT: vmovd %xmm3, %eax 2963; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3] 2964; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2965; AVX512-NEXT: movw %ax, 26(%rdi) 2966; AVX512-NEXT: vmovd %xmm3, %eax 2967; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 2968; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2969; AVX512-NEXT: movw %ax, 22(%rdi) 2970; AVX512-NEXT: vmovd %xmm3, %eax 2971; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2972; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2973; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2974; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2975; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] 2976; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2977; AVX512-NEXT: movw %ax, 20(%rdi) 2978; AVX512-NEXT: vmovd %xmm2, %eax 2979; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2980; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2981; AVX512-NEXT: movw %ax, 18(%rdi) 2982; AVX512-NEXT: vmovd %xmm2, %eax 2983; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2984; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2985; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2986; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2987; AVX512-NEXT: movw %ax, 14(%rdi) 2988; AVX512-NEXT: vmovd %xmm1, %eax 2989; AVX512-NEXT: movw %ax, 12(%rdi) 2990; AVX512-NEXT: vmovd %xmm2, %eax 2991; AVX512-NEXT: movw %ax, 10(%rdi) 2992; AVX512-NEXT: vmovd %xmm0, %eax 2993; AVX512-NEXT: movw %ax, 6(%rdi) 2994; AVX512-NEXT: vmovd %xmm3, %eax 2995; AVX512-NEXT: movw %ax, 4(%rdi) 2996; AVX512-NEXT: vmovd %xmm4, %eax 2997; AVX512-NEXT: movw %ax, 2(%rdi) 2998; AVX512-NEXT: vzeroupper 2999; AVX512-NEXT: retq 3000 %1 = fptrunc <16 x float> %a0 to <16 x half> 3001 %2 = bitcast <16 x half> %1 to <16 x i16> 3002 
  store <16 x i16> %2, <16 x i16>* %a1
  ret void
}

;
; Double to Half
;

define i16 @cvt_f64_to_i16(double %a0) nounwind {
; ALL-LABEL: cvt_f64_to_i16:
; ALL: # %bb.0:
; ALL-NEXT: jmp __truncdfhf2 # TAILCALL
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  ret i16 %2
}

define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; ALL-LABEL: cvt_2f64_to_2i16:
; ALL: # %bb.0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: subq $16, %rsp
; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movl %eax, %ebx
; ALL-NEXT: shll $16, %ebx
; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movzwl %ax, %eax
; ALL-NEXT: orl %ebx, %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: addq $16, %rsp
; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  ret <2 x i16> %2
}

define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $40, %rsp
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %r14d
; AVX1-NEXT: orl %ebx, %r14d
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %eax
; AVX1-NEXT: orl %ebx, %eax
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %r14, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: addq $40, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $40, %rsp
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r14d
; AVX2-NEXT: orl %ebx, %r14d
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 #
16-byte Reload 3106; AVX2-NEXT: callq __truncdfhf2 3107; AVX2-NEXT: movzwl %ax, %eax 3108; AVX2-NEXT: orl %ebx, %eax 3109; AVX2-NEXT: shlq $32, %rax 3110; AVX2-NEXT: orq %r14, %rax 3111; AVX2-NEXT: vmovq %rax, %xmm0 3112; AVX2-NEXT: addq $40, %rsp 3113; AVX2-NEXT: popq %rbx 3114; AVX2-NEXT: popq %r14 3115; AVX2-NEXT: retq 3116; 3117; AVX512-LABEL: cvt_4f64_to_4i16: 3118; AVX512: # %bb.0: 3119; AVX512-NEXT: pushq %r14 3120; AVX512-NEXT: pushq %rbx 3121; AVX512-NEXT: subq $40, %rsp 3122; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3123; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3124; AVX512-NEXT: vzeroupper 3125; AVX512-NEXT: callq __truncdfhf2 3126; AVX512-NEXT: movl %eax, %ebx 3127; AVX512-NEXT: shll $16, %ebx 3128; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3129; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3130; AVX512-NEXT: vzeroupper 3131; AVX512-NEXT: callq __truncdfhf2 3132; AVX512-NEXT: movzwl %ax, %r14d 3133; AVX512-NEXT: orl %ebx, %r14d 3134; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3135; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3136; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3137; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3138; AVX512-NEXT: vzeroupper 3139; AVX512-NEXT: callq __truncdfhf2 3140; AVX512-NEXT: movl %eax, %ebx 3141; AVX512-NEXT: shll $16, %ebx 3142; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3143; AVX512-NEXT: callq __truncdfhf2 3144; AVX512-NEXT: movzwl %ax, %eax 3145; AVX512-NEXT: orl %ebx, %eax 3146; AVX512-NEXT: shlq $32, %rax 3147; AVX512-NEXT: orq %r14, %rax 3148; AVX512-NEXT: vmovq %rax, %xmm0 3149; AVX512-NEXT: addq $40, %rsp 3150; AVX512-NEXT: popq %rbx 3151; AVX512-NEXT: popq %r14 3152; AVX512-NEXT: retq 3153 %1 = fptrunc <4 x double> %a0 to <4 x half> 3154 %2 = bitcast <4 x half> %1 to <4 x i16> 3155 ret <4 x i16> %2 3156} 3157 3158define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { 3159; AVX1-LABEL: cvt_4f64_to_8i16_undef: 3160; AVX1: # %bb.0: 3161; AVX1-NEXT: pushq %r14 3162; AVX1-NEXT: pushq %rbx 3163; AVX1-NEXT: subq $40, %rsp 3164; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3165; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3166; AVX1-NEXT: vzeroupper 3167; AVX1-NEXT: callq __truncdfhf2 3168; AVX1-NEXT: movl %eax, %ebx 3169; AVX1-NEXT: shll $16, %ebx 3170; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3171; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3172; AVX1-NEXT: vzeroupper 3173; AVX1-NEXT: callq __truncdfhf2 3174; AVX1-NEXT: movzwl %ax, %r14d 3175; AVX1-NEXT: orl %ebx, %r14d 3176; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3177; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3178; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3179; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3180; AVX1-NEXT: vzeroupper 3181; AVX1-NEXT: callq __truncdfhf2 3182; AVX1-NEXT: movl %eax, %ebx 3183; AVX1-NEXT: shll $16, %ebx 3184; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3185; AVX1-NEXT: callq __truncdfhf2 3186; AVX1-NEXT: movzwl %ax, %eax 3187; AVX1-NEXT: orl %ebx, %eax 3188; AVX1-NEXT: shlq $32, %rax 3189; AVX1-NEXT: orq %r14, %rax 3190; AVX1-NEXT: vmovq %rax, %xmm0 3191; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3192; AVX1-NEXT: addq $40, %rsp 3193; AVX1-NEXT: popq %rbx 3194; AVX1-NEXT: popq %r14 3195; AVX1-NEXT: retq 3196; 3197; AVX2-LABEL: cvt_4f64_to_8i16_undef: 3198; AVX2: # %bb.0: 3199; AVX2-NEXT: pushq %r14 3200; AVX2-NEXT: pushq %rbx 3201; AVX2-NEXT: subq $40, %rsp 3202; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte 
Spill 3203; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3204; AVX2-NEXT: vzeroupper 3205; AVX2-NEXT: callq __truncdfhf2 3206; AVX2-NEXT: movl %eax, %ebx 3207; AVX2-NEXT: shll $16, %ebx 3208; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3209; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3210; AVX2-NEXT: vzeroupper 3211; AVX2-NEXT: callq __truncdfhf2 3212; AVX2-NEXT: movzwl %ax, %r14d 3213; AVX2-NEXT: orl %ebx, %r14d 3214; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3215; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3216; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3217; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3218; AVX2-NEXT: vzeroupper 3219; AVX2-NEXT: callq __truncdfhf2 3220; AVX2-NEXT: movl %eax, %ebx 3221; AVX2-NEXT: shll $16, %ebx 3222; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3223; AVX2-NEXT: callq __truncdfhf2 3224; AVX2-NEXT: movzwl %ax, %eax 3225; AVX2-NEXT: orl %ebx, %eax 3226; AVX2-NEXT: shlq $32, %rax 3227; AVX2-NEXT: orq %r14, %rax 3228; AVX2-NEXT: vmovq %rax, %xmm0 3229; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3230; AVX2-NEXT: addq $40, %rsp 3231; AVX2-NEXT: popq %rbx 3232; AVX2-NEXT: popq %r14 3233; AVX2-NEXT: retq 3234; 3235; AVX512-LABEL: cvt_4f64_to_8i16_undef: 3236; AVX512: # %bb.0: 3237; AVX512-NEXT: pushq %r14 3238; AVX512-NEXT: pushq %rbx 3239; AVX512-NEXT: subq $40, %rsp 3240; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3241; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3242; AVX512-NEXT: vzeroupper 3243; AVX512-NEXT: callq __truncdfhf2 3244; AVX512-NEXT: movl %eax, %ebx 3245; AVX512-NEXT: shll $16, %ebx 3246; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3247; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3248; AVX512-NEXT: vzeroupper 3249; AVX512-NEXT: callq __truncdfhf2 3250; AVX512-NEXT: movzwl %ax, %r14d 3251; AVX512-NEXT: orl %ebx, %r14d 3252; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3253; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3254; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3255; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3256; AVX512-NEXT: vzeroupper 3257; AVX512-NEXT: callq __truncdfhf2 3258; AVX512-NEXT: movl %eax, %ebx 3259; AVX512-NEXT: shll $16, %ebx 3260; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3261; AVX512-NEXT: callq __truncdfhf2 3262; AVX512-NEXT: movzwl %ax, %eax 3263; AVX512-NEXT: orl %ebx, %eax 3264; AVX512-NEXT: shlq $32, %rax 3265; AVX512-NEXT: orq %r14, %rax 3266; AVX512-NEXT: vmovq %rax, %xmm0 3267; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3268; AVX512-NEXT: addq $40, %rsp 3269; AVX512-NEXT: popq %rbx 3270; AVX512-NEXT: popq %r14 3271; AVX512-NEXT: retq 3272 %1 = fptrunc <4 x double> %a0 to <4 x half> 3273 %2 = bitcast <4 x half> %1 to <4 x i16> 3274 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3275 ret <8 x i16> %3 3276} 3277 3278define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { 3279; AVX1-LABEL: cvt_4f64_to_8i16_zero: 3280; AVX1: # %bb.0: 3281; AVX1-NEXT: pushq %r14 3282; AVX1-NEXT: pushq %rbx 3283; AVX1-NEXT: subq $40, %rsp 3284; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3285; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3286; AVX1-NEXT: vzeroupper 3287; AVX1-NEXT: callq __truncdfhf2 3288; AVX1-NEXT: movl %eax, %ebx 3289; AVX1-NEXT: shll $16, %ebx 3290; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3291; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3292; AVX1-NEXT: vzeroupper 3293; 
AVX1-NEXT: callq __truncdfhf2 3294; AVX1-NEXT: movzwl %ax, %r14d 3295; AVX1-NEXT: orl %ebx, %r14d 3296; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3297; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3298; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3299; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3300; AVX1-NEXT: vzeroupper 3301; AVX1-NEXT: callq __truncdfhf2 3302; AVX1-NEXT: movl %eax, %ebx 3303; AVX1-NEXT: shll $16, %ebx 3304; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3305; AVX1-NEXT: callq __truncdfhf2 3306; AVX1-NEXT: movzwl %ax, %eax 3307; AVX1-NEXT: orl %ebx, %eax 3308; AVX1-NEXT: shlq $32, %rax 3309; AVX1-NEXT: orq %r14, %rax 3310; AVX1-NEXT: vmovq %rax, %xmm0 3311; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3312; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 3313; AVX1-NEXT: addq $40, %rsp 3314; AVX1-NEXT: popq %rbx 3315; AVX1-NEXT: popq %r14 3316; AVX1-NEXT: retq 3317; 3318; AVX2-SLOW-LABEL: cvt_4f64_to_8i16_zero: 3319; AVX2-SLOW: # %bb.0: 3320; AVX2-SLOW-NEXT: pushq %r14 3321; AVX2-SLOW-NEXT: pushq %rbx 3322; AVX2-SLOW-NEXT: subq $40, %rsp 3323; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3324; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3325; AVX2-SLOW-NEXT: vzeroupper 3326; AVX2-SLOW-NEXT: callq __truncdfhf2 3327; AVX2-SLOW-NEXT: movl %eax, %ebx 3328; AVX2-SLOW-NEXT: shll $16, %ebx 3329; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3330; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3331; AVX2-SLOW-NEXT: vzeroupper 3332; AVX2-SLOW-NEXT: callq __truncdfhf2 3333; AVX2-SLOW-NEXT: movzwl %ax, %r14d 3334; AVX2-SLOW-NEXT: orl %ebx, %r14d 3335; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3336; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 3337; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3338; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3339; AVX2-SLOW-NEXT: vzeroupper 3340; AVX2-SLOW-NEXT: callq __truncdfhf2 3341; AVX2-SLOW-NEXT: movl %eax, %ebx 3342; AVX2-SLOW-NEXT: shll $16, %ebx 3343; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3344; AVX2-SLOW-NEXT: callq __truncdfhf2 3345; AVX2-SLOW-NEXT: movzwl %ax, %eax 3346; AVX2-SLOW-NEXT: orl %ebx, %eax 3347; AVX2-SLOW-NEXT: shlq $32, %rax 3348; AVX2-SLOW-NEXT: orq %r14, %rax 3349; AVX2-SLOW-NEXT: vmovq %rax, %xmm0 3350; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3351; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 3352; AVX2-SLOW-NEXT: addq $40, %rsp 3353; AVX2-SLOW-NEXT: popq %rbx 3354; AVX2-SLOW-NEXT: popq %r14 3355; AVX2-SLOW-NEXT: retq 3356; 3357; AVX2-FAST-LABEL: cvt_4f64_to_8i16_zero: 3358; AVX2-FAST: # %bb.0: 3359; AVX2-FAST-NEXT: pushq %r14 3360; AVX2-FAST-NEXT: pushq %rbx 3361; AVX2-FAST-NEXT: subq $40, %rsp 3362; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3363; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3364; AVX2-FAST-NEXT: vzeroupper 3365; AVX2-FAST-NEXT: callq __truncdfhf2 3366; AVX2-FAST-NEXT: movl %eax, %ebx 3367; AVX2-FAST-NEXT: shll $16, %ebx 3368; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3369; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3370; AVX2-FAST-NEXT: vzeroupper 3371; AVX2-FAST-NEXT: callq __truncdfhf2 3372; AVX2-FAST-NEXT: movzwl %ax, %r14d 3373; AVX2-FAST-NEXT: orl %ebx, %r14d 3374; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3375; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 3376; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3377; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3378; AVX2-FAST-NEXT: 
vzeroupper 3379; AVX2-FAST-NEXT: callq __truncdfhf2 3380; AVX2-FAST-NEXT: movl %eax, %ebx 3381; AVX2-FAST-NEXT: shll $16, %ebx 3382; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3383; AVX2-FAST-NEXT: callq __truncdfhf2 3384; AVX2-FAST-NEXT: movzwl %ax, %eax 3385; AVX2-FAST-NEXT: orl %ebx, %eax 3386; AVX2-FAST-NEXT: shlq $32, %rax 3387; AVX2-FAST-NEXT: orq %r14, %rax 3388; AVX2-FAST-NEXT: vmovq %rax, %xmm0 3389; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 3390; AVX2-FAST-NEXT: addq $40, %rsp 3391; AVX2-FAST-NEXT: popq %rbx 3392; AVX2-FAST-NEXT: popq %r14 3393; AVX2-FAST-NEXT: retq 3394; 3395; AVX512F-LABEL: cvt_4f64_to_8i16_zero: 3396; AVX512F: # %bb.0: 3397; AVX512F-NEXT: pushq %r14 3398; AVX512F-NEXT: pushq %rbx 3399; AVX512F-NEXT: subq $40, %rsp 3400; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3401; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3402; AVX512F-NEXT: vzeroupper 3403; AVX512F-NEXT: callq __truncdfhf2 3404; AVX512F-NEXT: movl %eax, %ebx 3405; AVX512F-NEXT: shll $16, %ebx 3406; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3407; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3408; AVX512F-NEXT: vzeroupper 3409; AVX512F-NEXT: callq __truncdfhf2 3410; AVX512F-NEXT: movzwl %ax, %r14d 3411; AVX512F-NEXT: orl %ebx, %r14d 3412; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3413; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 3414; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3415; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3416; AVX512F-NEXT: vzeroupper 3417; AVX512F-NEXT: callq __truncdfhf2 3418; AVX512F-NEXT: movl %eax, %ebx 3419; AVX512F-NEXT: shll $16, %ebx 3420; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3421; AVX512F-NEXT: callq __truncdfhf2 3422; AVX512F-NEXT: movzwl %ax, %eax 3423; AVX512F-NEXT: orl %ebx, %eax 3424; AVX512F-NEXT: shlq $32, %rax 3425; AVX512F-NEXT: orq %r14, %rax 3426; AVX512F-NEXT: vmovq %rax, %xmm0 3427; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3428; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 3429; AVX512F-NEXT: addq $40, %rsp 3430; AVX512F-NEXT: popq %rbx 3431; AVX512F-NEXT: popq %r14 3432; AVX512F-NEXT: retq 3433; 3434; AVX512VL-LABEL: cvt_4f64_to_8i16_zero: 3435; AVX512VL: # %bb.0: 3436; AVX512VL-NEXT: pushq %r14 3437; AVX512VL-NEXT: pushq %rbx 3438; AVX512VL-NEXT: subq $40, %rsp 3439; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3440; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3441; AVX512VL-NEXT: vzeroupper 3442; AVX512VL-NEXT: callq __truncdfhf2 3443; AVX512VL-NEXT: movl %eax, %ebx 3444; AVX512VL-NEXT: shll $16, %ebx 3445; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3446; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3447; AVX512VL-NEXT: vzeroupper 3448; AVX512VL-NEXT: callq __truncdfhf2 3449; AVX512VL-NEXT: movzwl %ax, %r14d 3450; AVX512VL-NEXT: orl %ebx, %r14d 3451; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3452; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 3453; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3454; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3455; AVX512VL-NEXT: vzeroupper 3456; AVX512VL-NEXT: callq __truncdfhf2 3457; AVX512VL-NEXT: movl %eax, %ebx 3458; AVX512VL-NEXT: shll $16, %ebx 3459; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3460; AVX512VL-NEXT: callq __truncdfhf2 3461; AVX512VL-NEXT: movzwl %ax, %eax 3462; AVX512VL-NEXT: orl %ebx, %eax 3463; AVX512VL-NEXT: shlq $32, %rax 3464; AVX512VL-NEXT: orq %r14, 
%rax 3465; AVX512VL-NEXT: vmovq %rax, %xmm0 3466; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 3467; AVX512VL-NEXT: addq $40, %rsp 3468; AVX512VL-NEXT: popq %rbx 3469; AVX512VL-NEXT: popq %r14 3470; AVX512VL-NEXT: retq 3471 %1 = fptrunc <4 x double> %a0 to <4 x half> 3472 %2 = bitcast <4 x half> %1 to <4 x i16> 3473 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3474 ret <8 x i16> %3 3475} 3476 3477define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { 3478; AVX1-LABEL: cvt_8f64_to_8i16: 3479; AVX1: # %bb.0: 3480; AVX1-NEXT: pushq %r15 3481; AVX1-NEXT: pushq %r14 3482; AVX1-NEXT: pushq %rbx 3483; AVX1-NEXT: subq $64, %rsp 3484; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 3485; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3486; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3487; AVX1-NEXT: vzeroupper 3488; AVX1-NEXT: callq __truncdfhf2 3489; AVX1-NEXT: movl %eax, %ebx 3490; AVX1-NEXT: shll $16, %ebx 3491; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3492; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3493; AVX1-NEXT: vzeroupper 3494; AVX1-NEXT: callq __truncdfhf2 3495; AVX1-NEXT: movzwl %ax, %r15d 3496; AVX1-NEXT: orl %ebx, %r15d 3497; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3498; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3499; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3500; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3501; AVX1-NEXT: vzeroupper 3502; AVX1-NEXT: callq __truncdfhf2 3503; AVX1-NEXT: movl %eax, %ebx 3504; AVX1-NEXT: shll $16, %ebx 3505; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3506; AVX1-NEXT: callq __truncdfhf2 3507; AVX1-NEXT: movzwl %ax, %r14d 3508; AVX1-NEXT: orl %ebx, %r14d 3509; AVX1-NEXT: shlq $32, %r14 3510; AVX1-NEXT: orq %r15, %r14 3511; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3512; AVX1-NEXT: # xmm0 = mem[1,0] 3513; AVX1-NEXT: callq __truncdfhf2 3514; AVX1-NEXT: movl %eax, %ebx 3515; AVX1-NEXT: shll $16, %ebx 3516; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3517; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3518; AVX1-NEXT: vzeroupper 3519; AVX1-NEXT: callq __truncdfhf2 3520; AVX1-NEXT: movzwl %ax, %r15d 3521; AVX1-NEXT: orl %ebx, %r15d 3522; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3523; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3524; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3525; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3526; AVX1-NEXT: vzeroupper 3527; AVX1-NEXT: callq __truncdfhf2 3528; AVX1-NEXT: movl %eax, %ebx 3529; AVX1-NEXT: shll $16, %ebx 3530; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3531; AVX1-NEXT: callq __truncdfhf2 3532; AVX1-NEXT: movzwl %ax, %eax 3533; AVX1-NEXT: orl %ebx, %eax 3534; AVX1-NEXT: shlq $32, %rax 3535; AVX1-NEXT: orq %r15, %rax 3536; AVX1-NEXT: vmovq %rax, %xmm0 3537; AVX1-NEXT: vmovq %r14, %xmm1 3538; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 3539; AVX1-NEXT: addq $64, %rsp 3540; AVX1-NEXT: popq %rbx 3541; AVX1-NEXT: popq %r14 3542; AVX1-NEXT: popq %r15 3543; AVX1-NEXT: retq 3544; 3545; AVX2-LABEL: cvt_8f64_to_8i16: 3546; AVX2: # %bb.0: 3547; AVX2-NEXT: pushq %r15 3548; AVX2-NEXT: pushq %r14 3549; AVX2-NEXT: pushq %rbx 3550; AVX2-NEXT: subq $64, %rsp 3551; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 3552; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 3553; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3554; AVX2-NEXT: vzeroupper 3555; AVX2-NEXT: callq __truncdfhf2 3556; AVX2-NEXT: movl %eax, %ebx 3557; AVX2-NEXT: shll $16, %ebx 3558; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3559; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3560; AVX2-NEXT: vzeroupper 3561; AVX2-NEXT: callq __truncdfhf2 3562; AVX2-NEXT: movzwl %ax, %r15d 3563; AVX2-NEXT: orl %ebx, %r15d 3564; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3565; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3566; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3567; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3568; AVX2-NEXT: vzeroupper 3569; AVX2-NEXT: callq __truncdfhf2 3570; AVX2-NEXT: movl %eax, %ebx 3571; AVX2-NEXT: shll $16, %ebx 3572; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3573; AVX2-NEXT: callq __truncdfhf2 3574; AVX2-NEXT: movzwl %ax, %r14d 3575; AVX2-NEXT: orl %ebx, %r14d 3576; AVX2-NEXT: shlq $32, %r14 3577; AVX2-NEXT: orq %r15, %r14 3578; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload 3579; AVX2-NEXT: # xmm0 = mem[1,0] 3580; AVX2-NEXT: callq __truncdfhf2 3581; AVX2-NEXT: movl %eax, %ebx 3582; AVX2-NEXT: shll $16, %ebx 3583; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3584; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3585; AVX2-NEXT: vzeroupper 3586; AVX2-NEXT: callq __truncdfhf2 3587; AVX2-NEXT: movzwl %ax, %r15d 3588; AVX2-NEXT: orl %ebx, %r15d 3589; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3590; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3591; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3592; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3593; AVX2-NEXT: vzeroupper 3594; AVX2-NEXT: callq __truncdfhf2 3595; AVX2-NEXT: movl %eax, %ebx 3596; AVX2-NEXT: shll $16, %ebx 3597; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3598; AVX2-NEXT: callq __truncdfhf2 3599; AVX2-NEXT: movzwl %ax, %eax 3600; AVX2-NEXT: orl %ebx, %eax 3601; AVX2-NEXT: shlq $32, %rax 3602; AVX2-NEXT: orq %r15, %rax 3603; AVX2-NEXT: vmovq %rax, %xmm0 3604; AVX2-NEXT: vmovq %r14, %xmm1 3605; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 3606; AVX2-NEXT: addq $64, %rsp 3607; AVX2-NEXT: popq %rbx 3608; AVX2-NEXT: popq %r14 3609; AVX2-NEXT: popq %r15 3610; AVX2-NEXT: retq 3611; 3612; AVX512-LABEL: cvt_8f64_to_8i16: 3613; AVX512: # %bb.0: 3614; AVX512-NEXT: pushq %r15 3615; AVX512-NEXT: pushq %r14 3616; AVX512-NEXT: pushq %rbx 3617; AVX512-NEXT: subq $96, %rsp 3618; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill 3619; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3620; AVX512-NEXT: vzeroupper 3621; AVX512-NEXT: callq __truncdfhf2 3622; AVX512-NEXT: movl %eax, %ebx 3623; AVX512-NEXT: shll $16, %ebx 3624; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload 3625; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3626; AVX512-NEXT: vzeroupper 3627; AVX512-NEXT: callq __truncdfhf2 3628; AVX512-NEXT: movzwl %ax, %r15d 3629; AVX512-NEXT: orl %ebx, %r15d 3630; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload 3631; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3632; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3633; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3634; AVX512-NEXT: vzeroupper 3635; AVX512-NEXT: callq __truncdfhf2 3636; AVX512-NEXT: movl %eax, %ebx 3637; AVX512-NEXT: shll $16, %ebx 3638; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3639; AVX512-NEXT: callq __truncdfhf2 3640; 
AVX512-NEXT: movzwl %ax, %r14d
; AVX512-NEXT: orl %ebx, %r14d
; AVX512-NEXT: shlq $32, %r14
; AVX512-NEXT: orq %r15, %r14
; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %r15d
; AVX512-NEXT: orl %ebx, %r15d
; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: orl %ebx, %eax
; AVX512-NEXT: shlq $32, %rax
; AVX512-NEXT: orq %r15, %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vmovq %r14, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: addq $96, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  ret <8 x i16> %2
}

;
; Double to Half (Store)
;

define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
; ALL-LABEL: store_cvt_f64_to_i16:
; ALL: # %bb.0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, (%rbx)
; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, i16* %a1
  ret void
}

define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_2f64_to_2i16:
; ALL: # %bb.0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: pushq %rbx
; ALL-NEXT: subq $24, %rsp
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movl %eax, %ebp
; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, (%rbx)
; ALL-NEXT: movw %bp, 2(%rbx)
; ALL-NEXT: addq $24, %rsp
; ALL-NEXT: popq %rbx
; ALL-NEXT: popq %rbp
; ALL-NEXT: retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  store <2 x i16> %2, <2 x i16>* %a1
  ret void
}

define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:
vzeroupper 3741; AVX1-NEXT: callq __truncdfhf2 3742; AVX1-NEXT: movl %eax, %r14d 3743; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3744; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3745; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3746; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3747; AVX1-NEXT: vzeroupper 3748; AVX1-NEXT: callq __truncdfhf2 3749; AVX1-NEXT: movl %eax, %r15d 3750; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3751; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3752; AVX1-NEXT: vzeroupper 3753; AVX1-NEXT: callq __truncdfhf2 3754; AVX1-NEXT: movl %eax, %ebp 3755; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3756; AVX1-NEXT: callq __truncdfhf2 3757; AVX1-NEXT: movw %ax, 4(%rbx) 3758; AVX1-NEXT: movw %bp, (%rbx) 3759; AVX1-NEXT: movw %r15w, 6(%rbx) 3760; AVX1-NEXT: movw %r14w, 2(%rbx) 3761; AVX1-NEXT: addq $88, %rsp 3762; AVX1-NEXT: popq %rbx 3763; AVX1-NEXT: popq %r14 3764; AVX1-NEXT: popq %r15 3765; AVX1-NEXT: popq %rbp 3766; AVX1-NEXT: retq 3767; 3768; AVX2-LABEL: store_cvt_4f64_to_4i16: 3769; AVX2: # %bb.0: 3770; AVX2-NEXT: pushq %rbp 3771; AVX2-NEXT: pushq %r15 3772; AVX2-NEXT: pushq %r14 3773; AVX2-NEXT: pushq %rbx 3774; AVX2-NEXT: subq $88, %rsp 3775; AVX2-NEXT: movq %rdi, %rbx 3776; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3777; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3778; AVX2-NEXT: vzeroupper 3779; AVX2-NEXT: callq __truncdfhf2 3780; AVX2-NEXT: movl %eax, %r14d 3781; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3782; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3783; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3784; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3785; AVX2-NEXT: vzeroupper 3786; AVX2-NEXT: callq __truncdfhf2 3787; AVX2-NEXT: movl %eax, %r15d 3788; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3789; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3790; AVX2-NEXT: vzeroupper 3791; AVX2-NEXT: callq __truncdfhf2 3792; AVX2-NEXT: movl %eax, %ebp 3793; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3794; AVX2-NEXT: callq __truncdfhf2 3795; AVX2-NEXT: movw %ax, 4(%rbx) 3796; AVX2-NEXT: movw %bp, (%rbx) 3797; AVX2-NEXT: movw %r15w, 6(%rbx) 3798; AVX2-NEXT: movw %r14w, 2(%rbx) 3799; AVX2-NEXT: addq $88, %rsp 3800; AVX2-NEXT: popq %rbx 3801; AVX2-NEXT: popq %r14 3802; AVX2-NEXT: popq %r15 3803; AVX2-NEXT: popq %rbp 3804; AVX2-NEXT: retq 3805; 3806; AVX512-LABEL: store_cvt_4f64_to_4i16: 3807; AVX512: # %bb.0: 3808; AVX512-NEXT: pushq %rbp 3809; AVX512-NEXT: pushq %r15 3810; AVX512-NEXT: pushq %r14 3811; AVX512-NEXT: pushq %rbx 3812; AVX512-NEXT: subq $88, %rsp 3813; AVX512-NEXT: movq %rdi, %rbx 3814; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3815; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3816; AVX512-NEXT: vzeroupper 3817; AVX512-NEXT: callq __truncdfhf2 3818; AVX512-NEXT: movl %eax, %r14d 3819; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3820; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3821; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3822; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3823; AVX512-NEXT: vzeroupper 3824; AVX512-NEXT: callq __truncdfhf2 3825; AVX512-NEXT: movl %eax, %r15d 3826; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3827; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3828; AVX512-NEXT: 
vzeroupper 3829; AVX512-NEXT: callq __truncdfhf2 3830; AVX512-NEXT: movl %eax, %ebp 3831; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3832; AVX512-NEXT: callq __truncdfhf2 3833; AVX512-NEXT: movw %ax, 4(%rbx) 3834; AVX512-NEXT: movw %bp, (%rbx) 3835; AVX512-NEXT: movw %r15w, 6(%rbx) 3836; AVX512-NEXT: movw %r14w, 2(%rbx) 3837; AVX512-NEXT: addq $88, %rsp 3838; AVX512-NEXT: popq %rbx 3839; AVX512-NEXT: popq %r14 3840; AVX512-NEXT: popq %r15 3841; AVX512-NEXT: popq %rbp 3842; AVX512-NEXT: retq 3843 %1 = fptrunc <4 x double> %a0 to <4 x half> 3844 %2 = bitcast <4 x half> %1 to <4 x i16> 3845 store <4 x i16> %2, <4 x i16>* %a1 3846 ret void 3847} 3848 3849define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind { 3850; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: 3851; AVX1: # %bb.0: 3852; AVX1-NEXT: pushq %rbp 3853; AVX1-NEXT: pushq %r14 3854; AVX1-NEXT: pushq %rbx 3855; AVX1-NEXT: subq $32, %rsp 3856; AVX1-NEXT: movq %rdi, %r14 3857; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3858; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3859; AVX1-NEXT: vzeroupper 3860; AVX1-NEXT: callq __truncdfhf2 3861; AVX1-NEXT: movl %eax, %ebp 3862; AVX1-NEXT: shll $16, %ebp 3863; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3864; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3865; AVX1-NEXT: vzeroupper 3866; AVX1-NEXT: callq __truncdfhf2 3867; AVX1-NEXT: movzwl %ax, %ebx 3868; AVX1-NEXT: orl %ebp, %ebx 3869; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3870; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 3871; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3872; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3873; AVX1-NEXT: vzeroupper 3874; AVX1-NEXT: callq __truncdfhf2 3875; AVX1-NEXT: movl %eax, %ebp 3876; AVX1-NEXT: shll $16, %ebp 3877; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3878; AVX1-NEXT: callq __truncdfhf2 3879; AVX1-NEXT: movzwl %ax, %eax 3880; AVX1-NEXT: orl %ebp, %eax 3881; AVX1-NEXT: shlq $32, %rax 3882; AVX1-NEXT: orq %rbx, %rax 3883; AVX1-NEXT: vmovq %rax, %xmm0 3884; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3885; AVX1-NEXT: vmovdqa %xmm0, (%r14) 3886; AVX1-NEXT: addq $32, %rsp 3887; AVX1-NEXT: popq %rbx 3888; AVX1-NEXT: popq %r14 3889; AVX1-NEXT: popq %rbp 3890; AVX1-NEXT: retq 3891; 3892; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: 3893; AVX2: # %bb.0: 3894; AVX2-NEXT: pushq %rbp 3895; AVX2-NEXT: pushq %r14 3896; AVX2-NEXT: pushq %rbx 3897; AVX2-NEXT: subq $32, %rsp 3898; AVX2-NEXT: movq %rdi, %r14 3899; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3900; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3901; AVX2-NEXT: vzeroupper 3902; AVX2-NEXT: callq __truncdfhf2 3903; AVX2-NEXT: movl %eax, %ebp 3904; AVX2-NEXT: shll $16, %ebp 3905; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3906; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3907; AVX2-NEXT: vzeroupper 3908; AVX2-NEXT: callq __truncdfhf2 3909; AVX2-NEXT: movzwl %ax, %ebx 3910; AVX2-NEXT: orl %ebp, %ebx 3911; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3912; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 3913; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3914; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3915; AVX2-NEXT: vzeroupper 3916; AVX2-NEXT: callq __truncdfhf2 3917; AVX2-NEXT: movl %eax, %ebp 3918; AVX2-NEXT: shll $16, %ebp 3919; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3920; AVX2-NEXT: callq __truncdfhf2 3921; AVX2-NEXT: movzwl %ax, %eax 3922; AVX2-NEXT: orl %ebp, %eax 3923; AVX2-NEXT: shlq $32, %rax 3924; 
AVX2-NEXT: orq %rbx, %rax 3925; AVX2-NEXT: vmovq %rax, %xmm0 3926; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3927; AVX2-NEXT: vmovdqa %xmm0, (%r14) 3928; AVX2-NEXT: addq $32, %rsp 3929; AVX2-NEXT: popq %rbx 3930; AVX2-NEXT: popq %r14 3931; AVX2-NEXT: popq %rbp 3932; AVX2-NEXT: retq 3933; 3934; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: 3935; AVX512: # %bb.0: 3936; AVX512-NEXT: pushq %rbp 3937; AVX512-NEXT: pushq %r14 3938; AVX512-NEXT: pushq %rbx 3939; AVX512-NEXT: subq $32, %rsp 3940; AVX512-NEXT: movq %rdi, %r14 3941; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3942; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3943; AVX512-NEXT: vzeroupper 3944; AVX512-NEXT: callq __truncdfhf2 3945; AVX512-NEXT: movl %eax, %ebp 3946; AVX512-NEXT: shll $16, %ebp 3947; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3948; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3949; AVX512-NEXT: vzeroupper 3950; AVX512-NEXT: callq __truncdfhf2 3951; AVX512-NEXT: movzwl %ax, %ebx 3952; AVX512-NEXT: orl %ebp, %ebx 3953; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 3954; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 3955; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 3956; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3957; AVX512-NEXT: vzeroupper 3958; AVX512-NEXT: callq __truncdfhf2 3959; AVX512-NEXT: movl %eax, %ebp 3960; AVX512-NEXT: shll $16, %ebp 3961; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 3962; AVX512-NEXT: callq __truncdfhf2 3963; AVX512-NEXT: movzwl %ax, %eax 3964; AVX512-NEXT: orl %ebp, %eax 3965; AVX512-NEXT: shlq $32, %rax 3966; AVX512-NEXT: orq %rbx, %rax 3967; AVX512-NEXT: vmovq %rax, %xmm0 3968; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 3969; AVX512-NEXT: vmovdqa %xmm0, (%r14) 3970; AVX512-NEXT: addq $32, %rsp 3971; AVX512-NEXT: popq %rbx 3972; AVX512-NEXT: popq %r14 3973; AVX512-NEXT: popq %rbp 3974; AVX512-NEXT: retq 3975 %1 = fptrunc <4 x double> %a0 to <4 x half> 3976 %2 = bitcast <4 x half> %1 to <4 x i16> 3977 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3978 store <8 x i16> %3, <8 x i16>* %a1 3979 ret void 3980} 3981 3982define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { 3983; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: 3984; AVX1: # %bb.0: 3985; AVX1-NEXT: pushq %rbp 3986; AVX1-NEXT: pushq %r14 3987; AVX1-NEXT: pushq %rbx 3988; AVX1-NEXT: subq $32, %rsp 3989; AVX1-NEXT: movq %rdi, %r14 3990; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 3991; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 3992; AVX1-NEXT: vzeroupper 3993; AVX1-NEXT: callq __truncdfhf2 3994; AVX1-NEXT: movl %eax, %ebp 3995; AVX1-NEXT: shll $16, %ebp 3996; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3997; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3998; AVX1-NEXT: vzeroupper 3999; AVX1-NEXT: callq __truncdfhf2 4000; AVX1-NEXT: movzwl %ax, %ebx 4001; AVX1-NEXT: orl %ebp, %ebx 4002; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4003; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 4004; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4005; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4006; AVX1-NEXT: vzeroupper 4007; AVX1-NEXT: callq __truncdfhf2 4008; AVX1-NEXT: movl %eax, %ebp 4009; AVX1-NEXT: shll $16, %ebp 4010; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4011; AVX1-NEXT: callq __truncdfhf2 4012; AVX1-NEXT: movzwl %ax, %eax 4013; AVX1-NEXT: orl %ebp, %eax 4014; AVX1-NEXT: shlq $32, %rax 4015; AVX1-NEXT: orq %rbx, 
%rax 4016; AVX1-NEXT: vmovq %rax, %xmm0 4017; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 4018; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 4019; AVX1-NEXT: vmovdqa %xmm0, (%r14) 4020; AVX1-NEXT: addq $32, %rsp 4021; AVX1-NEXT: popq %rbx 4022; AVX1-NEXT: popq %r14 4023; AVX1-NEXT: popq %rbp 4024; AVX1-NEXT: retq 4025; 4026; AVX2-SLOW-LABEL: store_cvt_4f64_to_8i16_zero: 4027; AVX2-SLOW: # %bb.0: 4028; AVX2-SLOW-NEXT: pushq %rbp 4029; AVX2-SLOW-NEXT: pushq %r14 4030; AVX2-SLOW-NEXT: pushq %rbx 4031; AVX2-SLOW-NEXT: subq $32, %rsp 4032; AVX2-SLOW-NEXT: movq %rdi, %r14 4033; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4034; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4035; AVX2-SLOW-NEXT: vzeroupper 4036; AVX2-SLOW-NEXT: callq __truncdfhf2 4037; AVX2-SLOW-NEXT: movl %eax, %ebp 4038; AVX2-SLOW-NEXT: shll $16, %ebp 4039; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4040; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4041; AVX2-SLOW-NEXT: vzeroupper 4042; AVX2-SLOW-NEXT: callq __truncdfhf2 4043; AVX2-SLOW-NEXT: movzwl %ax, %ebx 4044; AVX2-SLOW-NEXT: orl %ebp, %ebx 4045; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4046; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 4047; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4048; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4049; AVX2-SLOW-NEXT: vzeroupper 4050; AVX2-SLOW-NEXT: callq __truncdfhf2 4051; AVX2-SLOW-NEXT: movl %eax, %ebp 4052; AVX2-SLOW-NEXT: shll $16, %ebp 4053; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4054; AVX2-SLOW-NEXT: callq __truncdfhf2 4055; AVX2-SLOW-NEXT: movzwl %ax, %eax 4056; AVX2-SLOW-NEXT: orl %ebp, %eax 4057; AVX2-SLOW-NEXT: shlq $32, %rax 4058; AVX2-SLOW-NEXT: orq %rbx, %rax 4059; AVX2-SLOW-NEXT: vmovq %rax, %xmm0 4060; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 4061; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 4062; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r14) 4063; AVX2-SLOW-NEXT: addq $32, %rsp 4064; AVX2-SLOW-NEXT: popq %rbx 4065; AVX2-SLOW-NEXT: popq %r14 4066; AVX2-SLOW-NEXT: popq %rbp 4067; AVX2-SLOW-NEXT: retq 4068; 4069; AVX2-FAST-LABEL: store_cvt_4f64_to_8i16_zero: 4070; AVX2-FAST: # %bb.0: 4071; AVX2-FAST-NEXT: pushq %rbp 4072; AVX2-FAST-NEXT: pushq %r14 4073; AVX2-FAST-NEXT: pushq %rbx 4074; AVX2-FAST-NEXT: subq $32, %rsp 4075; AVX2-FAST-NEXT: movq %rdi, %r14 4076; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4077; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4078; AVX2-FAST-NEXT: vzeroupper 4079; AVX2-FAST-NEXT: callq __truncdfhf2 4080; AVX2-FAST-NEXT: movl %eax, %ebp 4081; AVX2-FAST-NEXT: shll $16, %ebp 4082; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4083; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4084; AVX2-FAST-NEXT: vzeroupper 4085; AVX2-FAST-NEXT: callq __truncdfhf2 4086; AVX2-FAST-NEXT: movzwl %ax, %ebx 4087; AVX2-FAST-NEXT: orl %ebp, %ebx 4088; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4089; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 4090; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4091; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4092; AVX2-FAST-NEXT: vzeroupper 4093; AVX2-FAST-NEXT: callq __truncdfhf2 4094; AVX2-FAST-NEXT: movl %eax, %ebp 4095; AVX2-FAST-NEXT: shll $16, %ebp 4096; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4097; AVX2-FAST-NEXT: callq __truncdfhf2 4098; AVX2-FAST-NEXT: movzwl %ax, %eax 4099; AVX2-FAST-NEXT: orl %ebp, %eax 4100; AVX2-FAST-NEXT: shlq $32, %rax 4101; AVX2-FAST-NEXT: 
orq %rbx, %rax 4102; AVX2-FAST-NEXT: vmovq %rax, %xmm0 4103; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero 4104; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r14) 4105; AVX2-FAST-NEXT: addq $32, %rsp 4106; AVX2-FAST-NEXT: popq %rbx 4107; AVX2-FAST-NEXT: popq %r14 4108; AVX2-FAST-NEXT: popq %rbp 4109; AVX2-FAST-NEXT: retq 4110; 4111; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero: 4112; AVX512F: # %bb.0: 4113; AVX512F-NEXT: pushq %rbp 4114; AVX512F-NEXT: pushq %r14 4115; AVX512F-NEXT: pushq %rbx 4116; AVX512F-NEXT: subq $32, %rsp 4117; AVX512F-NEXT: movq %rdi, %r14 4118; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4119; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4120; AVX512F-NEXT: vzeroupper 4121; AVX512F-NEXT: callq __truncdfhf2 4122; AVX512F-NEXT: movl %eax, %ebp 4123; AVX512F-NEXT: shll $16, %ebp 4124; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4125; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4126; AVX512F-NEXT: vzeroupper 4127; AVX512F-NEXT: callq __truncdfhf2 4128; AVX512F-NEXT: movzwl %ax, %ebx 4129; AVX512F-NEXT: orl %ebp, %ebx 4130; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4131; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 4132; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4133; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4134; AVX512F-NEXT: vzeroupper 4135; AVX512F-NEXT: callq __truncdfhf2 4136; AVX512F-NEXT: movl %eax, %ebp 4137; AVX512F-NEXT: shll $16, %ebp 4138; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4139; AVX512F-NEXT: callq __truncdfhf2 4140; AVX512F-NEXT: movzwl %ax, %eax 4141; AVX512F-NEXT: orl %ebp, %eax 4142; AVX512F-NEXT: shlq $32, %rax 4143; AVX512F-NEXT: orq %rbx, %rax 4144; AVX512F-NEXT: vmovq %rax, %xmm0 4145; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 4146; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 4147; AVX512F-NEXT: vmovdqa %xmm0, (%r14) 4148; AVX512F-NEXT: addq $32, %rsp 4149; AVX512F-NEXT: popq %rbx 4150; AVX512F-NEXT: popq %r14 4151; AVX512F-NEXT: popq %rbp 4152; AVX512F-NEXT: retq 4153; 4154; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero: 4155; AVX512VL: # %bb.0: 4156; AVX512VL-NEXT: pushq %rbp 4157; AVX512VL-NEXT: pushq %r14 4158; AVX512VL-NEXT: pushq %rbx 4159; AVX512VL-NEXT: subq $32, %rsp 4160; AVX512VL-NEXT: movq %rdi, %r14 4161; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 4162; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4163; AVX512VL-NEXT: vzeroupper 4164; AVX512VL-NEXT: callq __truncdfhf2 4165; AVX512VL-NEXT: movl %eax, %ebp 4166; AVX512VL-NEXT: shll $16, %ebp 4167; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 4168; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4169; AVX512VL-NEXT: vzeroupper 4170; AVX512VL-NEXT: callq __truncdfhf2 4171; AVX512VL-NEXT: movzwl %ax, %ebx 4172; AVX512VL-NEXT: orl %ebp, %ebx 4173; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 4174; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 4175; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 4176; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 4177; AVX512VL-NEXT: vzeroupper 4178; AVX512VL-NEXT: callq __truncdfhf2 4179; AVX512VL-NEXT: movl %eax, %ebp 4180; AVX512VL-NEXT: shll $16, %ebp 4181; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 4182; AVX512VL-NEXT: callq __truncdfhf2 4183; AVX512VL-NEXT: movzwl %ax, %eax 4184; AVX512VL-NEXT: orl %ebp, %eax 4185; AVX512VL-NEXT: shlq $32, %rax 4186; AVX512VL-NEXT: orq %rbx, %rax 4187; AVX512VL-NEXT: vmovq %rax, %xmm0 4188; AVX512VL-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
; AVX512VL-NEXT: addq $32, %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, <8 x i16>* %a1
  ret void
}

define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_8f64_to_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $136, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r12d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r13d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r14d
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, 12(%rbx)
; AVX1-NEXT: movw %r15w, 8(%rbx)
; AVX1-NEXT: movw %r14w, 4(%rbx)
; AVX1-NEXT: movw %bp, (%rbx)
; AVX1-NEXT: movw %r13w, 14(%rbx)
; AVX1-NEXT: movw %r12w, 10(%rbx)
; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 6(%rbx)
; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 2(%rbx)
; AVX1-NEXT: addq $136, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_8f64_to_8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $136, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r12d
; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r13d
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, 12(%rbx)
; AVX2-NEXT: movw %r15w, 8(%rbx)
; AVX2-NEXT: movw %r14w, 4(%rbx)
; AVX2-NEXT: movw %bp, (%rbx)
; AVX2-NEXT: movw %r13w, 14(%rbx)
; AVX2-NEXT: movw %r12w, 10(%rbx)
; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 6(%rbx)
; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 2(%rbx)
; AVX2-NEXT: addq $136, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $200, %rsp
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r12d
; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r13d
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r14d
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r15d
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, 12(%rbx)
; AVX512-NEXT: movw %r15w, 8(%rbx)
; AVX512-NEXT: movw %r14w, 4(%rbx)
; AVX512-NEXT: movw %bp, (%rbx)
; AVX512-NEXT: movw %r13w, 14(%rbx)
; AVX512-NEXT: movw %r12w, 10(%rbx)
; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 6(%rbx)
; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 2(%rbx)
; AVX512-NEXT: addq $200, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* %a1
  ret void
}
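
; Note (illustrative summary, not a FileCheck assertion): with no native
; f64->f16 conversion instruction available, each double element in the tests
; above is truncated through the __truncdfhf2 libcall, and the returned i16
; halves are packed in GPRs before being moved into an XMM register, e.g. for
; a pair of elements:
;   packed32 = (zext(high_half) << 16) | zext(low_half)   ; shll $16 + movzwl + orl
; and two such 32-bit pairs are combined with shlq $32 + orq ahead of vmovq.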