; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512

;
; Half to Float
;

define float @cvt_i16_to_f32(i16 %a0) {
; ALL-LABEL: cvt_i16_to_f32:
; ALL:       # BB#0:
; ALL-NEXT:    movswl %di, %eax
; ALL-NEXT:    vmovd %eax, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    retq
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}

define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) {
; ALL-LABEL: cvt_4i16_to_4f32:
; ALL:       # BB#0:
; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT:    vmovq %xmm0, %rax
; ALL-NEXT:    movq %rax, %rcx
; ALL-NEXT:    movq %rax, %rdx
; ALL-NEXT:    movswl %ax, %esi
; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; ALL-NEXT:    shrl $16, %eax
; ALL-NEXT:    shrq $32, %rcx
; ALL-NEXT:    shrq $48, %rdx
; ALL-NEXT:    movswl %dx, %edx
; ALL-NEXT:    vmovd %edx, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl %cx, %ecx
; ALL-NEXT:    vmovd %ecx, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    cwtl
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    vmovd %esi, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = bitcast <4 x i16> %a0 to <4 x half>
  %2 = fpext <4 x half> %1 to <4 x float>
  ret <4 x float> %2
}

define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) {
; ALL-LABEL: cvt_8i16_to_4f32:
; ALL:       # BB#0:
; ALL-NEXT:    vmovq %xmm0, %rax
; ALL-NEXT:    movq %rax, %rcx
; ALL-NEXT:    movq %rax, %rdx
; ALL-NEXT:    movswl %ax, %esi
; ALL-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; ALL-NEXT:    shrl $16, %eax
; ALL-NEXT:    shrq $32, %rcx
; ALL-NEXT:    shrq $48, %rdx
; ALL-NEXT:    movswl %dx, %edx
; ALL-NEXT:    vmovd %edx, %xmm0
; ALL-NEXT:    vcvtph2ps %xmm0, %xmm0
; ALL-NEXT:    movswl %cx, %ecx
; ALL-NEXT:    vmovd %ecx, %xmm1
; ALL-NEXT:    vcvtph2ps %xmm1, %xmm1
; ALL-NEXT:    cwtl
; ALL-NEXT:    vmovd %eax, %xmm2
; ALL-NEXT:    vcvtph2ps %xmm2, %xmm2
; ALL-NEXT:    vmovd %esi, %xmm3
; ALL-NEXT:    vcvtph2ps %xmm3, %xmm3
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = fpext <4 x half> %2 to <4 x float>
  ret <4 x float> %3
}
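
; All of the conversions above are lowered the same way: each i16 element is
; sign-extended into a GPR, moved into an XMM register, and widened with the
; F16C vcvtph2ps instruction. Roughly, per element (as in cvt_i16_to_f32):
;
;   movswl %di, %eax
;   vmovd %eax, %xmm0
;   vcvtph2ps %xmm0, %xmm0
;
; The vector cases are fully scalarized, and the lanes are reassembled with
; vinsertps rather than converted with a single packed vcvtph2ps.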

define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) {
; AVX1-LABEL: cvt_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX1-NEXT:    movq %rdx, %r8
; AVX1-NEXT:    movq %rdx, %r10
; AVX1-NEXT:    movswl %dx, %r9d
; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    shrq $32, %r8
; AVX1-NEXT:    shrq $48, %r10
; AVX1-NEXT:    vmovq %xmm0, %rdi
; AVX1-NEXT:    movq %rdi, %rax
; AVX1-NEXT:    movq %rdi, %rsi
; AVX1-NEXT:    movswl %di, %ecx
; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
; AVX1-NEXT:    shrl $16, %edi
; AVX1-NEXT:    shrq $32, %rax
; AVX1-NEXT:    shrq $48, %rsi
; AVX1-NEXT:    movswl %si, %esi
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT:    movswl %di, %eax
; AVX1-NEXT:    vmovd %eax, %xmm2
; AVX1-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX1-NEXT:    vmovd %ecx, %xmm3
; AVX1-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT:    movswl %r10w, %eax
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX1-NEXT:    movswl %r8w, %eax
; AVX1-NEXT:    vmovd %eax, %xmm5
; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX1-NEXT:    movswl %dx, %eax
; AVX1-NEXT:    vmovd %eax, %xmm6
; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT:    vmovd %r9d, %xmm7
; AVX1-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: cvt_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX2-NEXT:    movq %rdx, %r8
; AVX2-NEXT:    movq %rdx, %r10
; AVX2-NEXT:    movswl %dx, %r9d
; AVX2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    shrq $32, %r8
; AVX2-NEXT:    shrq $48, %r10
; AVX2-NEXT:    vmovq %xmm0, %rdi
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    movq %rdi, %rsi
; AVX2-NEXT:    movswl %di, %ecx
; AVX2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
; AVX2-NEXT:    shrl $16, %edi
; AVX2-NEXT:    shrq $32, %rax
; AVX2-NEXT:    shrq $48, %rsi
; AVX2-NEXT:    movswl %si, %esi
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT:    cwtl
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT:    movswl %di, %eax
; AVX2-NEXT:    vmovd %eax, %xmm2
; AVX2-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX2-NEXT:    vmovd %ecx, %xmm3
; AVX2-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT:    movswl %r10w, %eax
; AVX2-NEXT:    vmovd %eax, %xmm4
; AVX2-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX2-NEXT:    movswl %r8w, %eax
; AVX2-NEXT:    vmovd %eax, %xmm5
; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX2-NEXT:    movswl %dx, %eax
; AVX2-NEXT:    vmovd %eax, %xmm6
; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT:    vmovd %r9d, %xmm7
; AVX2-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cvt_8i16_to_8f32:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
; AVX512-NEXT:    movq %rdx, %r8
; AVX512-NEXT:    movq %rdx, %r10
; AVX512-NEXT:    movswl %dx, %r9d
; AVX512-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill>
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    shrq $32, %r8
; AVX512-NEXT:    shrq $48, %r10
; AVX512-NEXT:    vmovq %xmm0, %rdi
; AVX512-NEXT:    movq %rdi, %rax
; AVX512-NEXT:    movq %rdi, %rsi
; AVX512-NEXT:    movswl %di, %ecx
; AVX512-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<kill>
; AVX512-NEXT:    shrl $16, %edi
; AVX512-NEXT:    shrq $32, %rax
; AVX512-NEXT:    shrq $48, %rsi
; AVX512-NEXT:    movswl %si, %esi
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    cwtl
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    movswl %di, %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vmovd %ecx, %xmm3
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    movswl %r10w, %eax
; AVX512-NEXT:    vmovd %eax, %xmm4
; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512-NEXT:    movswl %r8w, %eax
; AVX512-NEXT:    vmovd %eax, %xmm5
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    movswl %dx, %eax
; AVX512-NEXT:    vmovd %eax, %xmm6
; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm6
; AVX512-NEXT:    vmovd %r9d, %xmm7
; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm7
; AVX512-NEXT:    vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = bitcast <8 x i16> %a0 to <8 x half>
  %2 = fpext <8 x half> %1 to <8 x float>
  ret <8 x float> %2
}
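
; The 256-bit and 512-bit results follow the same scheme, just unrolled
; further: the source halves are moved to GPRs with vmovq/vpextrq, sliced into
; 16-bit pieces with shifts, converted one lane at a time, and the 128-bit
; pieces are glued back together with vinsertps, vinsertf128/vinserti128, and
; vinsertf64x4 for the 512-bit AVX512 results. A minimal IR sketch of the
; pattern exercised here (mirroring cvt_8i16_to_8f32 above; the function name
; @sketch_8i16_to_8f32 is illustrative only):
;
;   define <8 x float> @sketch_8i16_to_8f32(<8 x i16> %x) {
;     %h = bitcast <8 x i16> %x to <8 x half>
;     %f = fpext <8 x half> %h to <8 x float>
;     ret <8 x float> %f
;   }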

define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) {
; AVX1-LABEL: cvt_16i16_to_16f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vmovq %xmm4, %rax
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm8
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm9
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm10
; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm11
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm12
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm13
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm14
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vmovd %ecx, %xmm15
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $48, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    movq %rax, %rcx
; AVX1-NEXT:    shrq $32, %rcx
; AVX1-NEXT:    movswl %cx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm3
; AVX1-NEXT:    movswl %ax, %ecx
; AVX1-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    cwtl
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
AVX1-NEXT: vmovd %ecx, %xmm0 288; AVX1-NEXT: movq %rax, %rcx 289; AVX1-NEXT: shrq $48, %rcx 290; AVX1-NEXT: movswl %cx, %ecx 291; AVX1-NEXT: vmovd %ecx, %xmm5 292; AVX1-NEXT: movq %rax, %rcx 293; AVX1-NEXT: shrq $32, %rcx 294; AVX1-NEXT: movswl %cx, %ecx 295; AVX1-NEXT: vmovd %ecx, %xmm6 296; AVX1-NEXT: movl %eax, %ecx 297; AVX1-NEXT: shrl $16, %ecx 298; AVX1-NEXT: movswl %cx, %ecx 299; AVX1-NEXT: vmovd %ecx, %xmm7 300; AVX1-NEXT: cwtl 301; AVX1-NEXT: vmovd %eax, %xmm1 302; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8 303; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9 304; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10 305; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11 306; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12 307; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13 308; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14 309; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15 310; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 311; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 312; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 313; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 314; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 315; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 316; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 317; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 318; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] 319; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 320; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 321; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] 322; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] 323; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] 324; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 325; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] 326; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 327; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 328; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] 329; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 330; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 331; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 332; AVX1-NEXT: retq 333; 334; AVX2-LABEL: cvt_16i16_to_16f32: 335; AVX2: # BB#0: 336; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 337; AVX2-NEXT: vmovq %xmm4, %rax 338; AVX2-NEXT: movq %rax, %rcx 339; AVX2-NEXT: shrq $48, %rcx 340; AVX2-NEXT: movswl %cx, %ecx 341; AVX2-NEXT: vmovd %ecx, %xmm8 342; AVX2-NEXT: movq %rax, %rcx 343; AVX2-NEXT: shrq $32, %rcx 344; AVX2-NEXT: movswl %cx, %ecx 345; AVX2-NEXT: vmovd %ecx, %xmm9 346; AVX2-NEXT: movswl %ax, %ecx 347; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 348; AVX2-NEXT: shrl $16, %eax 349; AVX2-NEXT: cwtl 350; AVX2-NEXT: vmovd %eax, %xmm10 351; AVX2-NEXT: vpextrq $1, %xmm4, %rax 352; AVX2-NEXT: vmovd %ecx, %xmm11 353; AVX2-NEXT: movq %rax, %rcx 354; AVX2-NEXT: shrq $48, %rcx 355; AVX2-NEXT: movswl %cx, %ecx 356; AVX2-NEXT: vmovd %ecx, %xmm12 357; AVX2-NEXT: movq %rax, %rcx 358; AVX2-NEXT: shrq $32, %rcx 359; AVX2-NEXT: movswl %cx, %ecx 360; AVX2-NEXT: vmovd %ecx, %xmm13 361; AVX2-NEXT: movswl %ax, %ecx 362; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 363; AVX2-NEXT: shrl $16, %eax 364; AVX2-NEXT: cwtl 365; AVX2-NEXT: vmovd %eax, %xmm14 366; AVX2-NEXT: vmovq %xmm0, %rax 367; AVX2-NEXT: vmovd %ecx, %xmm15 368; AVX2-NEXT: movq %rax, %rcx 369; AVX2-NEXT: shrq $48, %rcx 370; AVX2-NEXT: movswl %cx, %ecx 371; AVX2-NEXT: vmovd %ecx, %xmm2 372; AVX2-NEXT: movq %rax, %rcx 373; AVX2-NEXT: shrq $32, %rcx 374; AVX2-NEXT: movswl %cx, %ecx 375; AVX2-NEXT: vmovd %ecx, %xmm3 376; AVX2-NEXT: movswl %ax, %ecx 377; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 378; 
AVX2-NEXT: shrl $16, %eax 379; AVX2-NEXT: cwtl 380; AVX2-NEXT: vmovd %eax, %xmm4 381; AVX2-NEXT: vpextrq $1, %xmm0, %rax 382; AVX2-NEXT: vmovd %ecx, %xmm0 383; AVX2-NEXT: movq %rax, %rcx 384; AVX2-NEXT: shrq $48, %rcx 385; AVX2-NEXT: movswl %cx, %ecx 386; AVX2-NEXT: vmovd %ecx, %xmm5 387; AVX2-NEXT: movq %rax, %rcx 388; AVX2-NEXT: shrq $32, %rcx 389; AVX2-NEXT: movswl %cx, %ecx 390; AVX2-NEXT: vmovd %ecx, %xmm6 391; AVX2-NEXT: movl %eax, %ecx 392; AVX2-NEXT: shrl $16, %ecx 393; AVX2-NEXT: movswl %cx, %ecx 394; AVX2-NEXT: vmovd %ecx, %xmm7 395; AVX2-NEXT: cwtl 396; AVX2-NEXT: vmovd %eax, %xmm1 397; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8 398; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9 399; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10 400; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11 401; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12 402; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13 403; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14 404; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15 405; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 406; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 407; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 408; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 409; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 410; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 411; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 412; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 413; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] 414; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 415; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 416; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] 417; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] 418; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] 419; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 420; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] 421; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 422; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 423; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] 424; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 425; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 426; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 427; AVX2-NEXT: retq 428; 429; AVX512-LABEL: cvt_16i16_to_16f32: 430; AVX512: # BB#0: 431; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10 432; AVX512-NEXT: vmovq %xmm0, %rax 433; AVX512-NEXT: movq %rax, %rcx 434; AVX512-NEXT: shrq $48, %rcx 435; AVX512-NEXT: movswl %cx, %ecx 436; AVX512-NEXT: vmovd %ecx, %xmm8 437; AVX512-NEXT: movq %rax, %rcx 438; AVX512-NEXT: shrq $32, %rcx 439; AVX512-NEXT: movswl %cx, %ecx 440; AVX512-NEXT: vmovd %ecx, %xmm9 441; AVX512-NEXT: movswl %ax, %ecx 442; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 443; AVX512-NEXT: shrl $16, %eax 444; AVX512-NEXT: cwtl 445; AVX512-NEXT: vmovd %eax, %xmm11 446; AVX512-NEXT: vpextrq $1, %xmm0, %rax 447; AVX512-NEXT: vmovd %ecx, %xmm12 448; AVX512-NEXT: movq %rax, %rcx 449; AVX512-NEXT: shrq $48, %rcx 450; AVX512-NEXT: movswl %cx, %ecx 451; AVX512-NEXT: vmovd %ecx, %xmm13 452; AVX512-NEXT: movq %rax, %rcx 453; AVX512-NEXT: shrq $32, %rcx 454; AVX512-NEXT: movswl %cx, %ecx 455; AVX512-NEXT: vmovd %ecx, %xmm14 456; AVX512-NEXT: movswl %ax, %ecx 457; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 458; AVX512-NEXT: shrl $16, %eax 459; AVX512-NEXT: cwtl 460; AVX512-NEXT: vmovd %eax, %xmm15 461; AVX512-NEXT: vmovq %xmm10, %rax 462; AVX512-NEXT: vmovd %ecx, %xmm2 463; AVX512-NEXT: movq %rax, %rcx 464; AVX512-NEXT: shrq $48, %rcx 465; AVX512-NEXT: movswl %cx, %ecx 466; AVX512-NEXT: vmovd %ecx, %xmm3 467; AVX512-NEXT: movq 
%rax, %rcx 468; AVX512-NEXT: shrq $32, %rcx 469; AVX512-NEXT: movswl %cx, %ecx 470; AVX512-NEXT: vmovd %ecx, %xmm1 471; AVX512-NEXT: movswl %ax, %ecx 472; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 473; AVX512-NEXT: shrl $16, %eax 474; AVX512-NEXT: cwtl 475; AVX512-NEXT: vmovd %eax, %xmm4 476; AVX512-NEXT: vpextrq $1, %xmm10, %rax 477; AVX512-NEXT: vmovd %ecx, %xmm10 478; AVX512-NEXT: movq %rax, %rcx 479; AVX512-NEXT: shrq $48, %rcx 480; AVX512-NEXT: movswl %cx, %ecx 481; AVX512-NEXT: vmovd %ecx, %xmm5 482; AVX512-NEXT: movq %rax, %rcx 483; AVX512-NEXT: shrq $32, %rcx 484; AVX512-NEXT: movswl %cx, %ecx 485; AVX512-NEXT: vmovd %ecx, %xmm6 486; AVX512-NEXT: movl %eax, %ecx 487; AVX512-NEXT: shrl $16, %ecx 488; AVX512-NEXT: movswl %cx, %ecx 489; AVX512-NEXT: vmovd %ecx, %xmm7 490; AVX512-NEXT: cwtl 491; AVX512-NEXT: vmovd %eax, %xmm0 492; AVX512-NEXT: vcvtph2ps %xmm8, %xmm8 493; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9 494; AVX512-NEXT: vcvtph2ps %xmm11, %xmm11 495; AVX512-NEXT: vcvtph2ps %xmm12, %xmm12 496; AVX512-NEXT: vcvtph2ps %xmm13, %xmm13 497; AVX512-NEXT: vcvtph2ps %xmm14, %xmm14 498; AVX512-NEXT: vcvtph2ps %xmm15, %xmm15 499; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 500; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 501; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 502; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 503; AVX512-NEXT: vcvtph2ps %xmm10, %xmm10 504; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 505; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 506; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 507; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 508; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3] 509; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] 510; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0] 511; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3] 512; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3] 513; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] 514; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 515; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3] 516; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] 517; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] 518; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] 519; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 520; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 521; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 522; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 523; AVX512-NEXT: retq 524 %1 = bitcast <16 x i16> %a0 to <16 x half> 525 %2 = fpext <16 x half> %1 to <16 x float> 526 ret <16 x float> %2 527} 528 529; 530; Half to Float (Load) 531; 532 533define float @load_cvt_i16_to_f32(i16* %a0) { 534; ALL-LABEL: load_cvt_i16_to_f32: 535; ALL: # BB#0: 536; ALL-NEXT: movswl (%rdi), %eax 537; ALL-NEXT: vmovd %eax, %xmm0 538; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 539; ALL-NEXT: retq 540 %1 = load i16, i16* %a0 541 %2 = bitcast i16 %1 to half 542 %3 = fpext half %2 to float 543 ret float %3 544} 545 546define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) { 547; ALL-LABEL: load_cvt_4i16_to_4f32: 548; ALL: # BB#0: 549; ALL-NEXT: movswl 6(%rdi), %eax 550; ALL-NEXT: vmovd %eax, %xmm0 551; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 552; ALL-NEXT: movswl 4(%rdi), %eax 553; ALL-NEXT: vmovd %eax, %xmm1 554; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 555; ALL-NEXT: movswl (%rdi), %eax 556; ALL-NEXT: vmovd %eax, %xmm2 557; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 558; ALL-NEXT: movswl 2(%rdi), %eax 559; ALL-NEXT: vmovd %eax, %xmm3 
560; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 561; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 562; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 563; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 564; ALL-NEXT: retq 565 %1 = load <4 x i16>, <4 x i16>* %a0 566 %2 = bitcast <4 x i16> %1 to <4 x half> 567 %3 = fpext <4 x half> %2 to <4 x float> 568 ret <4 x float> %3 569} 570 571define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) { 572; ALL-LABEL: load_cvt_8i16_to_4f32: 573; ALL: # BB#0: 574; ALL-NEXT: movq (%rdi), %rax 575; ALL-NEXT: movq %rax, %rcx 576; ALL-NEXT: movq %rax, %rdx 577; ALL-NEXT: movswl %ax, %esi 578; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> 579; ALL-NEXT: shrl $16, %eax 580; ALL-NEXT: shrq $32, %rcx 581; ALL-NEXT: shrq $48, %rdx 582; ALL-NEXT: movswl %dx, %edx 583; ALL-NEXT: vmovd %edx, %xmm0 584; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 585; ALL-NEXT: movswl %cx, %ecx 586; ALL-NEXT: vmovd %ecx, %xmm1 587; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 588; ALL-NEXT: cwtl 589; ALL-NEXT: vmovd %eax, %xmm2 590; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 591; ALL-NEXT: vmovd %esi, %xmm3 592; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 593; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 594; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 595; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 596; ALL-NEXT: retq 597 %1 = load <8 x i16>, <8 x i16>* %a0 598 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 599 %3 = bitcast <4 x i16> %2 to <4 x half> 600 %4 = fpext <4 x half> %3 to <4 x float> 601 ret <4 x float> %4 602} 603 604define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) { 605; AVX1-LABEL: load_cvt_8i16_to_8f32: 606; AVX1: # BB#0: 607; AVX1-NEXT: movswl 6(%rdi), %eax 608; AVX1-NEXT: vmovd %eax, %xmm0 609; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 610; AVX1-NEXT: movswl 4(%rdi), %eax 611; AVX1-NEXT: vmovd %eax, %xmm1 612; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 613; AVX1-NEXT: movswl (%rdi), %eax 614; AVX1-NEXT: vmovd %eax, %xmm2 615; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 616; AVX1-NEXT: movswl 2(%rdi), %eax 617; AVX1-NEXT: vmovd %eax, %xmm3 618; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 619; AVX1-NEXT: movswl 14(%rdi), %eax 620; AVX1-NEXT: vmovd %eax, %xmm4 621; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 622; AVX1-NEXT: movswl 12(%rdi), %eax 623; AVX1-NEXT: vmovd %eax, %xmm5 624; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 625; AVX1-NEXT: movswl 8(%rdi), %eax 626; AVX1-NEXT: vmovd %eax, %xmm6 627; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 628; AVX1-NEXT: movswl 10(%rdi), %eax 629; AVX1-NEXT: vmovd %eax, %xmm7 630; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 631; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 632; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 633; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 634; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 635; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 636; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 637; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 638; AVX1-NEXT: retq 639; 640; AVX2-LABEL: load_cvt_8i16_to_8f32: 641; AVX2: # BB#0: 642; AVX2-NEXT: movswl 6(%rdi), %eax 643; AVX2-NEXT: vmovd %eax, %xmm0 644; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 645; AVX2-NEXT: movswl 4(%rdi), %eax 646; AVX2-NEXT: vmovd %eax, %xmm1 647; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 648; AVX2-NEXT: movswl (%rdi), %eax 649; AVX2-NEXT: vmovd %eax, %xmm2 650; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 651; AVX2-NEXT: movswl 2(%rdi), %eax 652; 
AVX2-NEXT: vmovd %eax, %xmm3 653; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 654; AVX2-NEXT: movswl 14(%rdi), %eax 655; AVX2-NEXT: vmovd %eax, %xmm4 656; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 657; AVX2-NEXT: movswl 12(%rdi), %eax 658; AVX2-NEXT: vmovd %eax, %xmm5 659; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 660; AVX2-NEXT: movswl 8(%rdi), %eax 661; AVX2-NEXT: vmovd %eax, %xmm6 662; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 663; AVX2-NEXT: movswl 10(%rdi), %eax 664; AVX2-NEXT: vmovd %eax, %xmm7 665; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 666; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 667; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 668; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 669; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 670; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 671; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 672; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 673; AVX2-NEXT: retq 674; 675; AVX512-LABEL: load_cvt_8i16_to_8f32: 676; AVX512: # BB#0: 677; AVX512-NEXT: movswl 6(%rdi), %eax 678; AVX512-NEXT: vmovd %eax, %xmm0 679; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 680; AVX512-NEXT: movswl 4(%rdi), %eax 681; AVX512-NEXT: vmovd %eax, %xmm1 682; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 683; AVX512-NEXT: movswl (%rdi), %eax 684; AVX512-NEXT: vmovd %eax, %xmm2 685; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 686; AVX512-NEXT: movswl 2(%rdi), %eax 687; AVX512-NEXT: vmovd %eax, %xmm3 688; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 689; AVX512-NEXT: movswl 14(%rdi), %eax 690; AVX512-NEXT: vmovd %eax, %xmm4 691; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 692; AVX512-NEXT: movswl 12(%rdi), %eax 693; AVX512-NEXT: vmovd %eax, %xmm5 694; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 695; AVX512-NEXT: movswl 8(%rdi), %eax 696; AVX512-NEXT: vmovd %eax, %xmm6 697; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 698; AVX512-NEXT: movswl 10(%rdi), %eax 699; AVX512-NEXT: vmovd %eax, %xmm7 700; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 701; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 702; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 703; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 704; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 705; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 706; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 707; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 708; AVX512-NEXT: retq 709 %1 = load <8 x i16>, <8 x i16>* %a0 710 %2 = bitcast <8 x i16> %1 to <8 x half> 711 %3 = fpext <8 x half> %2 to <8 x float> 712 ret <8 x float> %3 713} 714 715define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) { 716; AVX1-LABEL: load_cvt_16i16_to_16f32: 717; AVX1: # BB#0: 718; AVX1-NEXT: movswl 22(%rdi), %eax 719; AVX1-NEXT: vmovd %eax, %xmm0 720; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8 721; AVX1-NEXT: movswl 20(%rdi), %eax 722; AVX1-NEXT: vmovd %eax, %xmm0 723; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9 724; AVX1-NEXT: movswl 16(%rdi), %eax 725; AVX1-NEXT: vmovd %eax, %xmm0 726; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10 727; AVX1-NEXT: movswl 18(%rdi), %eax 728; AVX1-NEXT: vmovd %eax, %xmm0 729; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11 730; AVX1-NEXT: movswl 30(%rdi), %eax 731; AVX1-NEXT: vmovd %eax, %xmm0 732; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12 733; AVX1-NEXT: movswl 28(%rdi), %eax 734; AVX1-NEXT: vmovd %eax, %xmm0 735; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13 736; AVX1-NEXT: movswl 24(%rdi), %eax 737; AVX1-NEXT: vmovd %eax, %xmm0 738; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14 739; AVX1-NEXT: 
movswl 26(%rdi), %eax 740; AVX1-NEXT: vmovd %eax, %xmm0 741; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15 742; AVX1-NEXT: movswl 6(%rdi), %eax 743; AVX1-NEXT: vmovd %eax, %xmm0 744; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 745; AVX1-NEXT: movswl 4(%rdi), %eax 746; AVX1-NEXT: vmovd %eax, %xmm2 747; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 748; AVX1-NEXT: movswl (%rdi), %eax 749; AVX1-NEXT: vmovd %eax, %xmm3 750; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 751; AVX1-NEXT: movswl 2(%rdi), %eax 752; AVX1-NEXT: vmovd %eax, %xmm4 753; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 754; AVX1-NEXT: movswl 14(%rdi), %eax 755; AVX1-NEXT: vmovd %eax, %xmm5 756; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 757; AVX1-NEXT: movswl 12(%rdi), %eax 758; AVX1-NEXT: vmovd %eax, %xmm6 759; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 760; AVX1-NEXT: movswl 8(%rdi), %eax 761; AVX1-NEXT: vmovd %eax, %xmm7 762; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 763; AVX1-NEXT: movswl 10(%rdi), %eax 764; AVX1-NEXT: vmovd %eax, %xmm1 765; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 766; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] 767; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 768; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 769; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 770; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 771; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 772; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 773; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 774; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 775; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 776; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 777; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 778; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 779; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 780; AVX1-NEXT: retq 781; 782; AVX2-LABEL: load_cvt_16i16_to_16f32: 783; AVX2: # BB#0: 784; AVX2-NEXT: movswl 22(%rdi), %eax 785; AVX2-NEXT: vmovd %eax, %xmm0 786; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8 787; AVX2-NEXT: movswl 20(%rdi), %eax 788; AVX2-NEXT: vmovd %eax, %xmm0 789; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9 790; AVX2-NEXT: movswl 16(%rdi), %eax 791; AVX2-NEXT: vmovd %eax, %xmm0 792; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10 793; AVX2-NEXT: movswl 18(%rdi), %eax 794; AVX2-NEXT: vmovd %eax, %xmm0 795; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11 796; AVX2-NEXT: movswl 30(%rdi), %eax 797; AVX2-NEXT: vmovd %eax, %xmm0 798; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12 799; AVX2-NEXT: movswl 28(%rdi), %eax 800; AVX2-NEXT: vmovd %eax, %xmm0 801; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13 802; AVX2-NEXT: movswl 24(%rdi), %eax 803; AVX2-NEXT: vmovd %eax, %xmm0 804; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14 805; AVX2-NEXT: movswl 26(%rdi), %eax 806; AVX2-NEXT: vmovd %eax, %xmm0 807; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15 808; AVX2-NEXT: movswl 6(%rdi), %eax 809; AVX2-NEXT: vmovd %eax, %xmm0 810; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 811; AVX2-NEXT: movswl 4(%rdi), %eax 812; AVX2-NEXT: vmovd %eax, %xmm2 813; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 814; AVX2-NEXT: movswl (%rdi), %eax 815; AVX2-NEXT: vmovd %eax, %xmm3 816; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 817; AVX2-NEXT: movswl 2(%rdi), %eax 818; AVX2-NEXT: vmovd %eax, %xmm4 819; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 820; AVX2-NEXT: movswl 14(%rdi), %eax 821; AVX2-NEXT: vmovd %eax, %xmm5 822; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 823; AVX2-NEXT: movswl 12(%rdi), %eax 824; AVX2-NEXT: vmovd %eax, %xmm6 825; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 826; AVX2-NEXT: movswl 
8(%rdi), %eax 827; AVX2-NEXT: vmovd %eax, %xmm7 828; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 829; AVX2-NEXT: movswl 10(%rdi), %eax 830; AVX2-NEXT: vmovd %eax, %xmm1 831; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 832; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] 833; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] 834; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] 835; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 836; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 837; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 838; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 839; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 840; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 841; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 842; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 843; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 844; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 845; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 846; AVX2-NEXT: retq 847; 848; AVX512-LABEL: load_cvt_16i16_to_16f32: 849; AVX512: # BB#0: 850; AVX512-NEXT: movswl 6(%rdi), %eax 851; AVX512-NEXT: vmovd %eax, %xmm0 852; AVX512-NEXT: vcvtph2ps %xmm0, %xmm8 853; AVX512-NEXT: movswl 4(%rdi), %eax 854; AVX512-NEXT: vmovd %eax, %xmm0 855; AVX512-NEXT: vcvtph2ps %xmm0, %xmm9 856; AVX512-NEXT: movswl (%rdi), %eax 857; AVX512-NEXT: vmovd %eax, %xmm0 858; AVX512-NEXT: vcvtph2ps %xmm0, %xmm10 859; AVX512-NEXT: movswl 2(%rdi), %eax 860; AVX512-NEXT: vmovd %eax, %xmm0 861; AVX512-NEXT: vcvtph2ps %xmm0, %xmm11 862; AVX512-NEXT: movswl 14(%rdi), %eax 863; AVX512-NEXT: vmovd %eax, %xmm0 864; AVX512-NEXT: vcvtph2ps %xmm0, %xmm12 865; AVX512-NEXT: movswl 12(%rdi), %eax 866; AVX512-NEXT: vmovd %eax, %xmm0 867; AVX512-NEXT: vcvtph2ps %xmm0, %xmm13 868; AVX512-NEXT: movswl 8(%rdi), %eax 869; AVX512-NEXT: vmovd %eax, %xmm0 870; AVX512-NEXT: vcvtph2ps %xmm0, %xmm14 871; AVX512-NEXT: movswl 10(%rdi), %eax 872; AVX512-NEXT: vmovd %eax, %xmm0 873; AVX512-NEXT: vcvtph2ps %xmm0, %xmm15 874; AVX512-NEXT: movswl 22(%rdi), %eax 875; AVX512-NEXT: vmovd %eax, %xmm0 876; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 877; AVX512-NEXT: movswl 20(%rdi), %eax 878; AVX512-NEXT: vmovd %eax, %xmm1 879; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 880; AVX512-NEXT: movswl 16(%rdi), %eax 881; AVX512-NEXT: vmovd %eax, %xmm2 882; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 883; AVX512-NEXT: movswl 18(%rdi), %eax 884; AVX512-NEXT: vmovd %eax, %xmm3 885; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 886; AVX512-NEXT: movswl 30(%rdi), %eax 887; AVX512-NEXT: vmovd %eax, %xmm4 888; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 889; AVX512-NEXT: movswl 28(%rdi), %eax 890; AVX512-NEXT: vmovd %eax, %xmm5 891; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 892; AVX512-NEXT: movswl 24(%rdi), %eax 893; AVX512-NEXT: vmovd %eax, %xmm6 894; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 895; AVX512-NEXT: movswl 26(%rdi), %eax 896; AVX512-NEXT: vmovd %eax, %xmm7 897; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 898; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 899; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] 900; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] 901; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] 902; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 903; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 904; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 905; AVX512-NEXT: vinsertps 
{{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] 906; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] 907; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] 908; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] 909; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] 910; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] 911; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 912; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 913; AVX512-NEXT: retq 914 %1 = load <16 x i16>, <16 x i16>* %a0 915 %2 = bitcast <16 x i16> %1 to <16 x half> 916 %3 = fpext <16 x half> %2 to <16 x float> 917 ret <16 x float> %3 918} 919 920; 921; Half to Double 922; 923 924define double @cvt_i16_to_f64(i16 %a0) { 925; ALL-LABEL: cvt_i16_to_f64: 926; ALL: # BB#0: 927; ALL-NEXT: movswl %di, %eax 928; ALL-NEXT: vmovd %eax, %xmm0 929; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 930; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 931; ALL-NEXT: retq 932 %1 = bitcast i16 %a0 to half 933 %2 = fpext half %1 to double 934 ret double %2 935} 936 937define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) { 938; ALL-LABEL: cvt_2i16_to_2f64: 939; ALL: # BB#0: 940; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 941; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 942; ALL-NEXT: vmovd %xmm0, %eax 943; ALL-NEXT: movswl %ax, %ecx 944; ALL-NEXT: shrl $16, %eax 945; ALL-NEXT: cwtl 946; ALL-NEXT: vmovd %eax, %xmm0 947; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 948; ALL-NEXT: vmovd %ecx, %xmm1 949; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 950; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 951; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 952; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 953; ALL-NEXT: retq 954 %1 = bitcast <2 x i16> %a0 to <2 x half> 955 %2 = fpext <2 x half> %1 to <2 x double> 956 ret <2 x double> %2 957} 958 959define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) { 960; ALL-LABEL: cvt_4i16_to_4f64: 961; ALL: # BB#0: 962; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 963; ALL-NEXT: vmovq %xmm0, %rax 964; ALL-NEXT: movq %rax, %rcx 965; ALL-NEXT: movl %eax, %edx 966; ALL-NEXT: movswl %ax, %esi 967; ALL-NEXT: shrq $48, %rax 968; ALL-NEXT: shrq $32, %rcx 969; ALL-NEXT: shrl $16, %edx 970; ALL-NEXT: movswl %dx, %edx 971; ALL-NEXT: vmovd %edx, %xmm0 972; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 973; ALL-NEXT: vmovd %esi, %xmm1 974; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 975; ALL-NEXT: movswl %cx, %ecx 976; ALL-NEXT: vmovd %ecx, %xmm2 977; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 978; ALL-NEXT: cwtl 979; ALL-NEXT: vmovd %eax, %xmm3 980; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 981; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 982; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 983; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 984; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 985; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 986; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 987; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 988; ALL-NEXT: retq 989 %1 = bitcast <4 x i16> %a0 to <4 x half> 990 %2 = fpext <4 x half> %1 to <4 x double> 991 ret <4 x double> %2 992} 993 994define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) { 995; ALL-LABEL: cvt_8i16_to_2f64: 996; ALL: # BB#0: 997; ALL-NEXT: vmovd %xmm0, %eax 998; ALL-NEXT: movswl %ax, %ecx 999; ALL-NEXT: shrl $16, %eax 1000; ALL-NEXT: cwtl 1001; ALL-NEXT: vmovd %eax, %xmm0 1002; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1003; ALL-NEXT: vmovd %ecx, %xmm1 1004; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1005; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1006; ALL-NEXT: 
vcvtss2sd %xmm0, %xmm0, %xmm0 1007; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1008; ALL-NEXT: retq 1009 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 1010 %2 = bitcast <2 x i16> %1 to <2 x half> 1011 %3 = fpext <2 x half> %2 to <2 x double> 1012 ret <2 x double> %3 1013} 1014 1015define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) { 1016; ALL-LABEL: cvt_8i16_to_4f64: 1017; ALL: # BB#0: 1018; ALL-NEXT: vmovq %xmm0, %rax 1019; ALL-NEXT: movq %rax, %rcx 1020; ALL-NEXT: movl %eax, %edx 1021; ALL-NEXT: movswl %ax, %esi 1022; ALL-NEXT: shrq $48, %rax 1023; ALL-NEXT: shrq $32, %rcx 1024; ALL-NEXT: shrl $16, %edx 1025; ALL-NEXT: movswl %dx, %edx 1026; ALL-NEXT: vmovd %edx, %xmm0 1027; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1028; ALL-NEXT: vmovd %esi, %xmm1 1029; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1030; ALL-NEXT: movswl %cx, %ecx 1031; ALL-NEXT: vmovd %ecx, %xmm2 1032; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1033; ALL-NEXT: cwtl 1034; ALL-NEXT: vmovd %eax, %xmm3 1035; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1036; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1037; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1038; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1039; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1040; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1041; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1042; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1043; ALL-NEXT: retq 1044 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1045 %2 = bitcast <4 x i16> %1 to <4 x half> 1046 %3 = fpext <4 x half> %2 to <4 x double> 1047 ret <4 x double> %3 1048} 1049 1050define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) { 1051; AVX1-LABEL: cvt_8i16_to_8f64: 1052; AVX1: # BB#0: 1053; AVX1-NEXT: vmovq %xmm0, %rdx 1054; AVX1-NEXT: movq %rdx, %r9 1055; AVX1-NEXT: movl %edx, %r10d 1056; AVX1-NEXT: movswl %dx, %r8d 1057; AVX1-NEXT: shrq $48, %rdx 1058; AVX1-NEXT: shrq $32, %r9 1059; AVX1-NEXT: shrl $16, %r10d 1060; AVX1-NEXT: vpextrq $1, %xmm0, %rdi 1061; AVX1-NEXT: movq %rdi, %rsi 1062; AVX1-NEXT: movl %edi, %eax 1063; AVX1-NEXT: movswl %di, %ecx 1064; AVX1-NEXT: shrq $48, %rdi 1065; AVX1-NEXT: shrq $32, %rsi 1066; AVX1-NEXT: shrl $16, %eax 1067; AVX1-NEXT: cwtl 1068; AVX1-NEXT: vmovd %eax, %xmm0 1069; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1070; AVX1-NEXT: vmovd %ecx, %xmm0 1071; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1072; AVX1-NEXT: movswl %si, %eax 1073; AVX1-NEXT: vmovd %eax, %xmm0 1074; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1075; AVX1-NEXT: movswl %di, %eax 1076; AVX1-NEXT: vmovd %eax, %xmm0 1077; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1078; AVX1-NEXT: movswl %r10w, %eax 1079; AVX1-NEXT: vmovd %eax, %xmm0 1080; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1081; AVX1-NEXT: vmovd %r8d, %xmm5 1082; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 1083; AVX1-NEXT: movswl %r9w, %eax 1084; AVX1-NEXT: vmovd %eax, %xmm6 1085; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1086; AVX1-NEXT: movswl %dx, %eax 1087; AVX1-NEXT: vmovd %eax, %xmm7 1088; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1089; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1090; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1091; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1092; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1093; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1094; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1095; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1096; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1097; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1098; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1099; AVX1-NEXT: vcvtss2sd %xmm2, 
%xmm2, %xmm2 1100; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1101; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1102; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1103; AVX1-NEXT: retq 1104; 1105; AVX2-LABEL: cvt_8i16_to_8f64: 1106; AVX2: # BB#0: 1107; AVX2-NEXT: vmovq %xmm0, %rdx 1108; AVX2-NEXT: movq %rdx, %r9 1109; AVX2-NEXT: movl %edx, %r10d 1110; AVX2-NEXT: movswl %dx, %r8d 1111; AVX2-NEXT: shrq $48, %rdx 1112; AVX2-NEXT: shrq $32, %r9 1113; AVX2-NEXT: shrl $16, %r10d 1114; AVX2-NEXT: vpextrq $1, %xmm0, %rdi 1115; AVX2-NEXT: movq %rdi, %rsi 1116; AVX2-NEXT: movl %edi, %eax 1117; AVX2-NEXT: movswl %di, %ecx 1118; AVX2-NEXT: shrq $48, %rdi 1119; AVX2-NEXT: shrq $32, %rsi 1120; AVX2-NEXT: shrl $16, %eax 1121; AVX2-NEXT: cwtl 1122; AVX2-NEXT: vmovd %eax, %xmm0 1123; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1124; AVX2-NEXT: vmovd %ecx, %xmm0 1125; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1126; AVX2-NEXT: movswl %si, %eax 1127; AVX2-NEXT: vmovd %eax, %xmm0 1128; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1129; AVX2-NEXT: movswl %di, %eax 1130; AVX2-NEXT: vmovd %eax, %xmm0 1131; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1132; AVX2-NEXT: movswl %r10w, %eax 1133; AVX2-NEXT: vmovd %eax, %xmm0 1134; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1135; AVX2-NEXT: vmovd %r8d, %xmm5 1136; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 1137; AVX2-NEXT: movswl %r9w, %eax 1138; AVX2-NEXT: vmovd %eax, %xmm6 1139; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 1140; AVX2-NEXT: movswl %dx, %eax 1141; AVX2-NEXT: vmovd %eax, %xmm7 1142; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 1143; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1144; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1145; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1146; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1147; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1148; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0] 1149; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1150; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1151; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1152; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1153; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1154; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1155; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1156; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1157; AVX2-NEXT: retq 1158; 1159; AVX512-LABEL: cvt_8i16_to_8f64: 1160; AVX512: # BB#0: 1161; AVX512-NEXT: vpextrq $1, %xmm0, %rdx 1162; AVX512-NEXT: movq %rdx, %r8 1163; AVX512-NEXT: movl %edx, %r10d 1164; AVX512-NEXT: movswl %dx, %r9d 1165; AVX512-NEXT: shrq $48, %rdx 1166; AVX512-NEXT: shrq $32, %r8 1167; AVX512-NEXT: shrl $16, %r10d 1168; AVX512-NEXT: vmovq %xmm0, %rdi 1169; AVX512-NEXT: movq %rdi, %rax 1170; AVX512-NEXT: movl %edi, %esi 1171; AVX512-NEXT: movswl %di, %ecx 1172; AVX512-NEXT: shrq $48, %rdi 1173; AVX512-NEXT: shrq $32, %rax 1174; AVX512-NEXT: shrl $16, %esi 1175; AVX512-NEXT: movswl %si, %esi 1176; AVX512-NEXT: vmovd %esi, %xmm0 1177; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1178; AVX512-NEXT: vmovd %ecx, %xmm1 1179; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 1180; AVX512-NEXT: cwtl 1181; AVX512-NEXT: vmovd %eax, %xmm2 1182; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1183; AVX512-NEXT: movswl %di, %eax 1184; AVX512-NEXT: vmovd %eax, %xmm3 1185; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1186; AVX512-NEXT: movswl %r10w, %eax 1187; AVX512-NEXT: vmovd %eax, %xmm4 1188; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 1189; AVX512-NEXT: vmovd %r9d, %xmm5 1190; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 1191; AVX512-NEXT: movswl %r8w, %eax 1192; AVX512-NEXT: vmovd %eax, %xmm6 1193; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 1194; 
AVX512-NEXT: movswl %dx, %eax 1195; AVX512-NEXT: vmovd %eax, %xmm7 1196; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 1197; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1198; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1199; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1200; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1201; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1202; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0] 1203; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 1204; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1205; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1206; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1207; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1208; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1209; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1210; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1211; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 1212; AVX512-NEXT: retq 1213 %1 = bitcast <8 x i16> %a0 to <8 x half> 1214 %2 = fpext <8 x half> %1 to <8 x double> 1215 ret <8 x double> %2 1216} 1217 1218; 1219; Half to Double (Load) 1220; 1221 1222define double @load_cvt_i16_to_f64(i16* %a0) { 1223; ALL-LABEL: load_cvt_i16_to_f64: 1224; ALL: # BB#0: 1225; ALL-NEXT: movswl (%rdi), %eax 1226; ALL-NEXT: vmovd %eax, %xmm0 1227; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1228; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1229; ALL-NEXT: retq 1230 %1 = load i16, i16* %a0 1231 %2 = bitcast i16 %1 to half 1232 %3 = fpext half %2 to double 1233 ret double %3 1234} 1235 1236define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) { 1237; ALL-LABEL: load_cvt_2i16_to_2f64: 1238; ALL: # BB#0: 1239; ALL-NEXT: movswl (%rdi), %eax 1240; ALL-NEXT: vmovd %eax, %xmm0 1241; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1242; ALL-NEXT: movswl 2(%rdi), %eax 1243; ALL-NEXT: vmovd %eax, %xmm1 1244; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1245; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1246; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1247; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1248; ALL-NEXT: retq 1249 %1 = load <2 x i16>, <2 x i16>* %a0 1250 %2 = bitcast <2 x i16> %1 to <2 x half> 1251 %3 = fpext <2 x half> %2 to <2 x double> 1252 ret <2 x double> %3 1253} 1254 1255define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) { 1256; ALL-LABEL: load_cvt_4i16_to_4f64: 1257; ALL: # BB#0: 1258; ALL-NEXT: movswl (%rdi), %eax 1259; ALL-NEXT: vmovd %eax, %xmm0 1260; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1261; ALL-NEXT: movswl 2(%rdi), %eax 1262; ALL-NEXT: vmovd %eax, %xmm1 1263; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1264; ALL-NEXT: movswl 4(%rdi), %eax 1265; ALL-NEXT: vmovd %eax, %xmm2 1266; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1267; ALL-NEXT: movswl 6(%rdi), %eax 1268; ALL-NEXT: vmovd %eax, %xmm3 1269; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1270; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1271; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1272; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1273; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1274; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1275; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1276; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1277; ALL-NEXT: retq 1278 %1 = load <4 x i16>, <4 x i16>* %a0 1279 %2 = bitcast <4 x i16> %1 to <4 x half> 1280 %3 = fpext <4 x half> %2 to <4 x double> 1281 ret <4 x double> %3 1282} 1283 1284define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) { 1285; ALL-LABEL: load_cvt_8i16_to_4f64: 1286; ALL: # BB#0: 1287; ALL-NEXT: movq (%rdi), %rax 1288; ALL-NEXT: movq %rax, %rcx 1289; ALL-NEXT: movl %eax, %edx 1290; ALL-NEXT: movswl %ax, 
%esi 1291; ALL-NEXT: shrq $48, %rax 1292; ALL-NEXT: shrq $32, %rcx 1293; ALL-NEXT: shrl $16, %edx 1294; ALL-NEXT: movswl %dx, %edx 1295; ALL-NEXT: vmovd %edx, %xmm0 1296; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 1297; ALL-NEXT: vmovd %esi, %xmm1 1298; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 1299; ALL-NEXT: movswl %cx, %ecx 1300; ALL-NEXT: vmovd %ecx, %xmm2 1301; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 1302; ALL-NEXT: cwtl 1303; ALL-NEXT: vmovd %eax, %xmm3 1304; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 1305; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1306; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1307; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1308; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1309; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1310; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1311; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1312; ALL-NEXT: retq 1313 %1 = load <8 x i16>, <8 x i16>* %a0 1314 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1315 %3 = bitcast <4 x i16> %2 to <4 x half> 1316 %4 = fpext <4 x half> %3 to <4 x double> 1317 ret <4 x double> %4 1318} 1319 1320define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) { 1321; AVX1-LABEL: load_cvt_8i16_to_8f64: 1322; AVX1: # BB#0: 1323; AVX1-NEXT: movswl 8(%rdi), %eax 1324; AVX1-NEXT: vmovd %eax, %xmm0 1325; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 1326; AVX1-NEXT: movswl 10(%rdi), %eax 1327; AVX1-NEXT: vmovd %eax, %xmm0 1328; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 1329; AVX1-NEXT: movswl 12(%rdi), %eax 1330; AVX1-NEXT: vmovd %eax, %xmm0 1331; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 1332; AVX1-NEXT: movswl 14(%rdi), %eax 1333; AVX1-NEXT: vmovd %eax, %xmm0 1334; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 1335; AVX1-NEXT: movswl (%rdi), %eax 1336; AVX1-NEXT: vmovd %eax, %xmm0 1337; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 1338; AVX1-NEXT: movswl 2(%rdi), %eax 1339; AVX1-NEXT: vmovd %eax, %xmm5 1340; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 1341; AVX1-NEXT: movswl 4(%rdi), %eax 1342; AVX1-NEXT: vmovd %eax, %xmm6 1343; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 1344; AVX1-NEXT: movswl 6(%rdi), %eax 1345; AVX1-NEXT: vmovd %eax, %xmm7 1346; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 1347; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1348; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1349; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1350; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1351; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1352; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1353; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1354; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1355; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1356; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1357; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1358; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1359; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1360; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1361; AVX1-NEXT: retq 1362; 1363; AVX2-LABEL: load_cvt_8i16_to_8f64: 1364; AVX2: # BB#0: 1365; AVX2-NEXT: movswl 8(%rdi), %eax 1366; AVX2-NEXT: vmovd %eax, %xmm0 1367; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 1368; AVX2-NEXT: movswl 10(%rdi), %eax 1369; AVX2-NEXT: vmovd %eax, %xmm0 1370; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 1371; AVX2-NEXT: movswl 12(%rdi), %eax 1372; AVX2-NEXT: vmovd %eax, %xmm0 1373; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 1374; AVX2-NEXT: movswl 14(%rdi), %eax 1375; AVX2-NEXT: vmovd %eax, %xmm0 1376; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4 1377; AVX2-NEXT: movswl (%rdi), %eax 1378; AVX2-NEXT: vmovd %eax, %xmm0 1379; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 1380; AVX2-NEXT: movswl 2(%rdi), 
%eax 1381; AVX2-NEXT: vmovd %eax, %xmm5 1382; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 1383; AVX2-NEXT: movswl 4(%rdi), %eax 1384; AVX2-NEXT: vmovd %eax, %xmm6 1385; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 1386; AVX2-NEXT: movswl 6(%rdi), %eax 1387; AVX2-NEXT: vmovd %eax, %xmm7 1388; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 1389; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1390; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1391; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1392; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1393; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1394; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1395; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 1396; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1397; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1398; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 1399; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1400; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1401; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1402; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1403; AVX2-NEXT: retq 1404; 1405; AVX512-LABEL: load_cvt_8i16_to_8f64: 1406; AVX512: # BB#0: 1407; AVX512-NEXT: movswl (%rdi), %eax 1408; AVX512-NEXT: vmovd %eax, %xmm0 1409; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1410; AVX512-NEXT: movswl 2(%rdi), %eax 1411; AVX512-NEXT: vmovd %eax, %xmm1 1412; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 1413; AVX512-NEXT: movswl 4(%rdi), %eax 1414; AVX512-NEXT: vmovd %eax, %xmm2 1415; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1416; AVX512-NEXT: movswl 6(%rdi), %eax 1417; AVX512-NEXT: vmovd %eax, %xmm3 1418; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1419; AVX512-NEXT: movswl 8(%rdi), %eax 1420; AVX512-NEXT: vmovd %eax, %xmm4 1421; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 1422; AVX512-NEXT: movswl 10(%rdi), %eax 1423; AVX512-NEXT: vmovd %eax, %xmm5 1424; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 1425; AVX512-NEXT: movswl 12(%rdi), %eax 1426; AVX512-NEXT: vmovd %eax, %xmm6 1427; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 1428; AVX512-NEXT: movswl 14(%rdi), %eax 1429; AVX512-NEXT: vmovd %eax, %xmm7 1430; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 1431; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 1432; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 1433; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 1434; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 1435; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 1436; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0] 1437; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 1438; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 1439; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 1440; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] 1441; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 1442; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 1443; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1444; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1445; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 1446; AVX512-NEXT: retq 1447 %1 = load <8 x i16>, <8 x i16>* %a0 1448 %2 = bitcast <8 x i16> %1 to <8 x half> 1449 %3 = fpext <8 x half> %2 to <8 x double> 1450 ret <8 x double> %3 1451} 1452 1453; 1454; Float to Half 1455; 1456 1457define i16 @cvt_f32_to_i16(float %a0) { 1458; ALL-LABEL: cvt_f32_to_i16: 1459; ALL: # BB#0: 1460; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1461; ALL-NEXT: vmovd %xmm0, %eax 1462; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> 1463; ALL-NEXT: retq 1464 %1 = fptrunc float %a0 to half 1465 %2 = bitcast half %1 to i16 1466 ret i16 %2 1467} 1468 1469define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) { 1470; ALL-LABEL: cvt_4f32_to_4i16: 1471; ALL: # 
BB#0: 1472; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1473; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1474; ALL-NEXT: vmovd %xmm1, %eax 1475; ALL-NEXT: shll $16, %eax 1476; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1477; ALL-NEXT: vmovd %xmm1, %ecx 1478; ALL-NEXT: movzwl %cx, %ecx 1479; ALL-NEXT: orl %eax, %ecx 1480; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1481; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1482; ALL-NEXT: vmovd %xmm1, %eax 1483; ALL-NEXT: shll $16, %eax 1484; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1485; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1486; ALL-NEXT: vmovd %xmm0, %edx 1487; ALL-NEXT: movzwl %dx, %edx 1488; ALL-NEXT: orl %eax, %edx 1489; ALL-NEXT: shlq $32, %rdx 1490; ALL-NEXT: orq %rcx, %rdx 1491; ALL-NEXT: vmovq %rdx, %xmm0 1492; ALL-NEXT: retq 1493 %1 = fptrunc <4 x float> %a0 to <4 x half> 1494 %2 = bitcast <4 x half> %1 to <4 x i16> 1495 ret <4 x i16> %2 1496} 1497 1498define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) { 1499; ALL-LABEL: cvt_4f32_to_8i16_undef: 1500; ALL: # BB#0: 1501; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1502; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1503; ALL-NEXT: vmovd %xmm1, %eax 1504; ALL-NEXT: shll $16, %eax 1505; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1506; ALL-NEXT: vmovd %xmm1, %ecx 1507; ALL-NEXT: movzwl %cx, %ecx 1508; ALL-NEXT: orl %eax, %ecx 1509; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1510; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1511; ALL-NEXT: vmovd %xmm1, %eax 1512; ALL-NEXT: shll $16, %eax 1513; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1514; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1515; ALL-NEXT: vmovd %xmm0, %edx 1516; ALL-NEXT: movzwl %dx, %edx 1517; ALL-NEXT: orl %eax, %edx 1518; ALL-NEXT: shlq $32, %rdx 1519; ALL-NEXT: orq %rcx, %rdx 1520; ALL-NEXT: vmovq %rdx, %xmm0 1521; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1522; ALL-NEXT: retq 1523 %1 = fptrunc <4 x float> %a0 to <4 x half> 1524 %2 = bitcast <4 x half> %1 to <4 x i16> 1525 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1526 ret <8 x i16> %3 1527} 1528 1529define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) { 1530; ALL-LABEL: cvt_4f32_to_8i16_zero: 1531; ALL: # BB#0: 1532; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1533; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1534; ALL-NEXT: vmovd %xmm1, %eax 1535; ALL-NEXT: shll $16, %eax 1536; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1537; ALL-NEXT: vmovd %xmm1, %ecx 1538; ALL-NEXT: movzwl %cx, %ecx 1539; ALL-NEXT: orl %eax, %ecx 1540; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1541; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1542; ALL-NEXT: vmovd %xmm1, %eax 1543; ALL-NEXT: shll $16, %eax 1544; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1545; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1546; ALL-NEXT: vmovd %xmm0, %edx 1547; ALL-NEXT: movzwl %dx, %edx 1548; ALL-NEXT: orl %eax, %edx 1549; ALL-NEXT: shlq $32, %rdx 1550; ALL-NEXT: orq %rcx, %rdx 1551; ALL-NEXT: vmovq %rdx, %xmm0 1552; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 1553; ALL-NEXT: retq 1554 %1 = fptrunc <4 x float> %a0 to <4 x half> 1555 %2 = bitcast <4 x half> %1 to <4 x i16> 1556 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1557 ret <8 x i16> %3 1558} 1559 1560define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) { 1561; AVX1-LABEL: cvt_8f32_to_8i16: 1562; AVX1: # BB#0: 1563; AVX1-NEXT: vmovshdup 
{{.*#+}} xmm1 = xmm0[1,1,3,3] 1564; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1565; AVX1-NEXT: vmovd %xmm1, %eax 1566; AVX1-NEXT: shll $16, %eax 1567; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1568; AVX1-NEXT: vmovd %xmm1, %ecx 1569; AVX1-NEXT: movzwl %cx, %ecx 1570; AVX1-NEXT: orl %eax, %ecx 1571; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1572; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1573; AVX1-NEXT: vmovd %xmm1, %edx 1574; AVX1-NEXT: shll $16, %edx 1575; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1576; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1577; AVX1-NEXT: vmovd %xmm1, %eax 1578; AVX1-NEXT: movzwl %ax, %eax 1579; AVX1-NEXT: orl %edx, %eax 1580; AVX1-NEXT: shlq $32, %rax 1581; AVX1-NEXT: orq %rcx, %rax 1582; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1583; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1584; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1585; AVX1-NEXT: vmovd %xmm1, %ecx 1586; AVX1-NEXT: shll $16, %ecx 1587; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1588; AVX1-NEXT: vmovd %xmm1, %edx 1589; AVX1-NEXT: movzwl %dx, %edx 1590; AVX1-NEXT: orl %ecx, %edx 1591; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1592; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1593; AVX1-NEXT: vmovd %xmm1, %ecx 1594; AVX1-NEXT: shll $16, %ecx 1595; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1596; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1597; AVX1-NEXT: vmovd %xmm0, %esi 1598; AVX1-NEXT: movzwl %si, %esi 1599; AVX1-NEXT: orl %ecx, %esi 1600; AVX1-NEXT: shlq $32, %rsi 1601; AVX1-NEXT: orq %rdx, %rsi 1602; AVX1-NEXT: vmovq %rsi, %xmm0 1603; AVX1-NEXT: vmovq %rax, %xmm1 1604; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1605; AVX1-NEXT: vzeroupper 1606; AVX1-NEXT: retq 1607; 1608; AVX2-LABEL: cvt_8f32_to_8i16: 1609; AVX2: # BB#0: 1610; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1611; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1612; AVX2-NEXT: vmovd %xmm1, %eax 1613; AVX2-NEXT: shll $16, %eax 1614; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1615; AVX2-NEXT: vmovd %xmm1, %ecx 1616; AVX2-NEXT: movzwl %cx, %ecx 1617; AVX2-NEXT: orl %eax, %ecx 1618; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1619; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1620; AVX2-NEXT: vmovd %xmm1, %edx 1621; AVX2-NEXT: shll $16, %edx 1622; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1623; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1624; AVX2-NEXT: vmovd %xmm1, %eax 1625; AVX2-NEXT: movzwl %ax, %eax 1626; AVX2-NEXT: orl %edx, %eax 1627; AVX2-NEXT: shlq $32, %rax 1628; AVX2-NEXT: orq %rcx, %rax 1629; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 1630; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1631; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1632; AVX2-NEXT: vmovd %xmm1, %ecx 1633; AVX2-NEXT: shll $16, %ecx 1634; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1635; AVX2-NEXT: vmovd %xmm1, %edx 1636; AVX2-NEXT: movzwl %dx, %edx 1637; AVX2-NEXT: orl %ecx, %edx 1638; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1639; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1640; AVX2-NEXT: vmovd %xmm1, %ecx 1641; AVX2-NEXT: shll $16, %ecx 1642; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1643; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1644; AVX2-NEXT: vmovd %xmm0, %esi 1645; AVX2-NEXT: movzwl %si, %esi 1646; AVX2-NEXT: orl %ecx, %esi 1647; AVX2-NEXT: shlq $32, %rsi 1648; AVX2-NEXT: orq %rdx, %rsi 1649; AVX2-NEXT: vmovq %rsi, %xmm0 1650; AVX2-NEXT: vmovq %rax, %xmm1 1651; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1652; AVX2-NEXT: vzeroupper 1653; AVX2-NEXT: retq 1654; 1655; AVX512-LABEL: cvt_8f32_to_8i16: 1656; AVX512: # BB#0: 1657; AVX512-NEXT: 
vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1658; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1659; AVX512-NEXT: vmovd %xmm1, %eax 1660; AVX512-NEXT: shll $16, %eax 1661; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1662; AVX512-NEXT: vmovd %xmm1, %ecx 1663; AVX512-NEXT: movzwl %cx, %ecx 1664; AVX512-NEXT: orl %eax, %ecx 1665; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1666; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1667; AVX512-NEXT: vmovd %xmm1, %edx 1668; AVX512-NEXT: shll $16, %edx 1669; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1670; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1671; AVX512-NEXT: vmovd %xmm1, %eax 1672; AVX512-NEXT: movzwl %ax, %eax 1673; AVX512-NEXT: orl %edx, %eax 1674; AVX512-NEXT: shlq $32, %rax 1675; AVX512-NEXT: orq %rcx, %rax 1676; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 1677; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1678; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1679; AVX512-NEXT: vmovd %xmm1, %ecx 1680; AVX512-NEXT: shll $16, %ecx 1681; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1682; AVX512-NEXT: vmovd %xmm1, %edx 1683; AVX512-NEXT: movzwl %dx, %edx 1684; AVX512-NEXT: orl %ecx, %edx 1685; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1686; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1687; AVX512-NEXT: vmovd %xmm1, %ecx 1688; AVX512-NEXT: shll $16, %ecx 1689; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1690; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1691; AVX512-NEXT: vmovd %xmm0, %esi 1692; AVX512-NEXT: movzwl %si, %esi 1693; AVX512-NEXT: orl %ecx, %esi 1694; AVX512-NEXT: shlq $32, %rsi 1695; AVX512-NEXT: orq %rdx, %rsi 1696; AVX512-NEXT: vmovq %rsi, %xmm0 1697; AVX512-NEXT: vmovq %rax, %xmm1 1698; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1699; AVX512-NEXT: retq 1700 %1 = fptrunc <8 x float> %a0 to <8 x half> 1701 %2 = bitcast <8 x half> %1 to <8 x i16> 1702 ret <8 x i16> %2 1703} 1704 1705define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) { 1706; AVX1-LABEL: cvt_16f32_to_16i16: 1707; AVX1: # BB#0: 1708; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2 1709; AVX1-NEXT: vmovd %xmm2, %eax 1710; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1711; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1712; AVX1-NEXT: vmovd %eax, %xmm3 1713; AVX1-NEXT: vmovd %xmm2, %eax 1714; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1715; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1716; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1717; AVX1-NEXT: vmovd %xmm2, %eax 1718; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1719; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 1720; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1721; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1722; AVX1-NEXT: vmovd %xmm1, %eax 1723; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1724; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1725; AVX1-NEXT: vmovd %xmm1, %eax 1726; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 1727; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1728; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1729; AVX1-NEXT: vmovd %xmm1, %eax 1730; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1731; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1732; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1733; AVX1-NEXT: vmovd %xmm1, %eax 1734; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1735; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 1736; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1737; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 1738; AVX1-NEXT: vmovd %xmm2, %eax 1739; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 1740; AVX1-NEXT: vmovd %xmm1, %eax 1741; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1742; AVX1-NEXT: vcvtps2ph $4, 
%xmm1, %xmm1 1743; AVX1-NEXT: vmovd %eax, %xmm3 1744; AVX1-NEXT: vmovd %xmm1, %eax 1745; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1746; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1747; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1748; AVX1-NEXT: vmovd %xmm1, %eax 1749; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1750; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 1751; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1752; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1753; AVX1-NEXT: vmovd %xmm0, %eax 1754; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0 1755; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1756; AVX1-NEXT: vmovd %xmm0, %eax 1757; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1758; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1759; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1760; AVX1-NEXT: vmovd %xmm0, %eax 1761; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 1762; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1763; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1764; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1765; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1766; AVX1-NEXT: vmovd %xmm1, %eax 1767; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 1768; AVX1-NEXT: vmovd %xmm0, %eax 1769; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 1770; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1771; AVX1-NEXT: retq 1772; 1773; AVX2-LABEL: cvt_16f32_to_16i16: 1774; AVX2: # BB#0: 1775; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2 1776; AVX2-NEXT: vmovd %xmm2, %eax 1777; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1778; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1779; AVX2-NEXT: vmovd %eax, %xmm3 1780; AVX2-NEXT: vmovd %xmm2, %eax 1781; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1782; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1783; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1784; AVX2-NEXT: vmovd %xmm2, %eax 1785; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 1786; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 1787; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1788; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1789; AVX2-NEXT: vmovd %xmm1, %eax 1790; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1791; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1792; AVX2-NEXT: vmovd %xmm1, %eax 1793; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 1794; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1795; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1796; AVX2-NEXT: vmovd %xmm1, %eax 1797; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1798; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1799; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1800; AVX2-NEXT: vmovd %xmm1, %eax 1801; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1802; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 1803; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1804; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 1805; AVX2-NEXT: vmovd %xmm2, %eax 1806; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 1807; AVX2-NEXT: vmovd %xmm1, %eax 1808; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1809; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1810; AVX2-NEXT: vmovd %eax, %xmm3 1811; AVX2-NEXT: vmovd %xmm1, %eax 1812; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1813; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1814; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1815; AVX2-NEXT: vmovd %xmm1, %eax 1816; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 1817; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 1818; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1819; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1820; AVX2-NEXT: vmovd %xmm0, %eax 1821; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0 1822; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1823; AVX2-NEXT: vmovd %xmm0, %eax 1824; AVX2-NEXT: vmovshdup 
{{.*#+}} xmm0 = xmm1[1,1,3,3] 1825; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1826; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1827; AVX2-NEXT: vmovd %xmm0, %eax 1828; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 1829; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1830; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1831; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1832; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1833; AVX2-NEXT: vmovd %xmm1, %eax 1834; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 1835; AVX2-NEXT: vmovd %xmm0, %eax 1836; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 1837; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1838; AVX2-NEXT: retq 1839; 1840; AVX512-LABEL: cvt_16f32_to_16i16: 1841; AVX512: # BB#0: 1842; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 1843; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2 1844; AVX512-NEXT: vmovd %xmm2, %eax 1845; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1846; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1847; AVX512-NEXT: vmovd %eax, %xmm3 1848; AVX512-NEXT: vmovd %xmm2, %eax 1849; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1850; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1851; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1852; AVX512-NEXT: vmovd %xmm2, %eax 1853; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1854; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] 1855; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1856; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1857; AVX512-NEXT: vmovd %xmm1, %eax 1858; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1859; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1860; AVX512-NEXT: vmovd %xmm1, %eax 1861; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 1862; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1863; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1864; AVX512-NEXT: vmovd %xmm1, %eax 1865; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] 1866; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1867; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1868; AVX512-NEXT: vmovd %xmm1, %eax 1869; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1870; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] 1871; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1872; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 1873; AVX512-NEXT: vmovd %xmm2, %eax 1874; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 1875; AVX512-NEXT: vmovd %xmm1, %eax 1876; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1877; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1878; AVX512-NEXT: vmovd %eax, %xmm3 1879; AVX512-NEXT: vmovd %xmm1, %eax 1880; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1881; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1882; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 1883; AVX512-NEXT: vmovd %xmm1, %eax 1884; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 1885; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 1886; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1887; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 1888; AVX512-NEXT: vmovd %xmm0, %eax 1889; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 1890; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 1891; AVX512-NEXT: vmovd %xmm0, %eax 1892; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1893; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1894; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 1895; AVX512-NEXT: vmovd %xmm0, %eax 1896; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] 1897; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1898; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 1899; AVX512-NEXT: vmovd %xmm0, %eax 1900; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] 1901; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1902; AVX512-NEXT: vpinsrw $6, %eax, 
%xmm3, %xmm1 1903; AVX512-NEXT: vmovd %xmm0, %eax 1904; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 1905; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1906; AVX512-NEXT: retq 1907 %1 = fptrunc <16 x float> %a0 to <16 x half> 1908 %2 = bitcast <16 x half> %1 to <16 x i16> 1909 ret <16 x i16> %2 1910} 1911 1912; 1913; Float to Half (Store) 1914; 1915 1916define void @store_cvt_f32_to_i16(float %a0, i16* %a1) { 1917; ALL-LABEL: store_cvt_f32_to_i16: 1918; ALL: # BB#0: 1919; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1920; ALL-NEXT: vmovd %xmm0, %eax 1921; ALL-NEXT: movw %ax, (%rdi) 1922; ALL-NEXT: retq 1923 %1 = fptrunc float %a0 to half 1924 %2 = bitcast half %1 to i16 1925 store i16 %2, i16* %a1 1926 ret void 1927} 1928 1929define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) { 1930; ALL-LABEL: store_cvt_4f32_to_4i16: 1931; ALL: # BB#0: 1932; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1933; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1934; ALL-NEXT: vmovd %xmm1, %eax 1935; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1936; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1937; ALL-NEXT: vmovd %xmm1, %ecx 1938; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1939; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1940; ALL-NEXT: vmovd %xmm1, %edx 1941; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1942; ALL-NEXT: vmovd %xmm0, %esi 1943; ALL-NEXT: movw %si, (%rdi) 1944; ALL-NEXT: movw %dx, 6(%rdi) 1945; ALL-NEXT: movw %cx, 4(%rdi) 1946; ALL-NEXT: movw %ax, 2(%rdi) 1947; ALL-NEXT: retq 1948 %1 = fptrunc <4 x float> %a0 to <4 x half> 1949 %2 = bitcast <4 x half> %1 to <4 x i16> 1950 store <4 x i16> %2, <4 x i16>* %a1 1951 ret void 1952} 1953 1954define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) { 1955; ALL-LABEL: store_cvt_4f32_to_8i16_undef: 1956; ALL: # BB#0: 1957; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1958; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1959; ALL-NEXT: vmovd %xmm1, %eax 1960; ALL-NEXT: shll $16, %eax 1961; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1962; ALL-NEXT: vmovd %xmm1, %ecx 1963; ALL-NEXT: movzwl %cx, %ecx 1964; ALL-NEXT: orl %eax, %ecx 1965; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1966; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1967; ALL-NEXT: vmovd %xmm1, %eax 1968; ALL-NEXT: shll $16, %eax 1969; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1970; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1971; ALL-NEXT: vmovd %xmm0, %edx 1972; ALL-NEXT: movzwl %dx, %edx 1973; ALL-NEXT: orl %eax, %edx 1974; ALL-NEXT: shlq $32, %rdx 1975; ALL-NEXT: orq %rcx, %rdx 1976; ALL-NEXT: vmovq %rdx, %xmm0 1977; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1978; ALL-NEXT: vmovdqa %xmm0, (%rdi) 1979; ALL-NEXT: retq 1980 %1 = fptrunc <4 x float> %a0 to <4 x half> 1981 %2 = bitcast <4 x half> %1 to <4 x i16> 1982 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1983 store <8 x i16> %3, <8 x i16>* %a1 1984 ret void 1985} 1986 1987define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) { 1988; ALL-LABEL: store_cvt_4f32_to_8i16_zero: 1989; ALL: # BB#0: 1990; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1991; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 1992; ALL-NEXT: vmovd %xmm1, %eax 1993; ALL-NEXT: shll $16, %eax 1994; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 1995; ALL-NEXT: vmovd %xmm1, %ecx 1996; ALL-NEXT: movzwl %cx, %ecx 1997; ALL-NEXT: orl %eax, %ecx 1998; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 1999; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2000; ALL-NEXT: vmovd 
%xmm1, %eax 2001; ALL-NEXT: shll $16, %eax 2002; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2003; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2004; ALL-NEXT: vmovd %xmm0, %edx 2005; ALL-NEXT: movzwl %dx, %edx 2006; ALL-NEXT: orl %eax, %edx 2007; ALL-NEXT: shlq $32, %rdx 2008; ALL-NEXT: orq %rcx, %rdx 2009; ALL-NEXT: vmovq %rdx, %xmm0 2010; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2011; ALL-NEXT: vmovdqa %xmm0, (%rdi) 2012; ALL-NEXT: retq 2013 %1 = fptrunc <4 x float> %a0 to <4 x half> 2014 %2 = bitcast <4 x half> %1 to <4 x i16> 2015 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2016 store <8 x i16> %3, <8 x i16>* %a1 2017 ret void 2018} 2019 2020define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) { 2021; AVX1-LABEL: store_cvt_8f32_to_8i16: 2022; AVX1: # BB#0: 2023; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2024; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2025; AVX1-NEXT: vmovd %xmm1, %r8d 2026; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2027; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2028; AVX1-NEXT: vmovd %xmm1, %r9d 2029; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2030; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2031; AVX1-NEXT: vmovd %xmm1, %r10d 2032; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2033; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2034; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2035; AVX1-NEXT: vmovd %xmm2, %r11d 2036; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2037; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2038; AVX1-NEXT: vmovd %xmm2, %eax 2039; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2040; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2041; AVX1-NEXT: vmovd %xmm2, %ecx 2042; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2043; AVX1-NEXT: vmovd %xmm0, %edx 2044; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2045; AVX1-NEXT: vmovd %xmm0, %esi 2046; AVX1-NEXT: movw %si, 8(%rdi) 2047; AVX1-NEXT: movw %dx, (%rdi) 2048; AVX1-NEXT: movw %cx, 14(%rdi) 2049; AVX1-NEXT: movw %ax, 12(%rdi) 2050; AVX1-NEXT: movw %r11w, 10(%rdi) 2051; AVX1-NEXT: movw %r10w, 6(%rdi) 2052; AVX1-NEXT: movw %r9w, 4(%rdi) 2053; AVX1-NEXT: movw %r8w, 2(%rdi) 2054; AVX1-NEXT: vzeroupper 2055; AVX1-NEXT: retq 2056; 2057; AVX2-LABEL: store_cvt_8f32_to_8i16: 2058; AVX2: # BB#0: 2059; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2060; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2061; AVX2-NEXT: vmovd %xmm1, %r8d 2062; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2063; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2064; AVX2-NEXT: vmovd %xmm1, %r9d 2065; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2066; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2067; AVX2-NEXT: vmovd %xmm1, %r10d 2068; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 2069; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2070; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2071; AVX2-NEXT: vmovd %xmm2, %r11d 2072; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2073; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2074; AVX2-NEXT: vmovd %xmm2, %eax 2075; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2076; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2077; AVX2-NEXT: vmovd %xmm2, %ecx 2078; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2079; AVX2-NEXT: vmovd %xmm0, %edx 2080; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2081; AVX2-NEXT: vmovd %xmm0, %esi 2082; AVX2-NEXT: movw %si, 8(%rdi) 2083; AVX2-NEXT: movw %dx, (%rdi) 2084; AVX2-NEXT: movw %cx, 14(%rdi) 2085; AVX2-NEXT: movw %ax, 12(%rdi) 2086; AVX2-NEXT: movw %r11w, 10(%rdi) 2087; AVX2-NEXT: 
movw %r10w, 6(%rdi) 2088; AVX2-NEXT: movw %r9w, 4(%rdi) 2089; AVX2-NEXT: movw %r8w, 2(%rdi) 2090; AVX2-NEXT: vzeroupper 2091; AVX2-NEXT: retq 2092; 2093; AVX512-LABEL: store_cvt_8f32_to_8i16: 2094; AVX512: # BB#0: 2095; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2096; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2097; AVX512-NEXT: vmovd %xmm1, %r8d 2098; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2099; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2100; AVX512-NEXT: vmovd %xmm1, %r9d 2101; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] 2102; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2103; AVX512-NEXT: vmovd %xmm1, %r10d 2104; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2105; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2106; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2107; AVX512-NEXT: vmovd %xmm2, %r11d 2108; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 2109; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2110; AVX512-NEXT: vmovd %xmm2, %eax 2111; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2112; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2113; AVX512-NEXT: vmovd %xmm2, %ecx 2114; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2115; AVX512-NEXT: vmovd %xmm0, %edx 2116; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 2117; AVX512-NEXT: vmovd %xmm0, %esi 2118; AVX512-NEXT: movw %si, 8(%rdi) 2119; AVX512-NEXT: movw %dx, (%rdi) 2120; AVX512-NEXT: movw %cx, 14(%rdi) 2121; AVX512-NEXT: movw %ax, 12(%rdi) 2122; AVX512-NEXT: movw %r11w, 10(%rdi) 2123; AVX512-NEXT: movw %r10w, 6(%rdi) 2124; AVX512-NEXT: movw %r9w, 4(%rdi) 2125; AVX512-NEXT: movw %r8w, 2(%rdi) 2126; AVX512-NEXT: retq 2127 %1 = fptrunc <8 x float> %a0 to <8 x half> 2128 %2 = bitcast <8 x half> %1 to <8 x i16> 2129 store <8 x i16> %2, <8 x i16>* %a1 2130 ret void 2131} 2132 2133define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) { 2134; AVX1-LABEL: store_cvt_16f32_to_16i16: 2135; AVX1: # BB#0: 2136; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2137; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2138; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2139; AVX1-NEXT: vmovd %xmm4, %eax 2140; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2141; AVX1-NEXT: movw %ax, 24(%rdi) 2142; AVX1-NEXT: vmovd %xmm4, %eax 2143; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2144; AVX1-NEXT: movw %ax, 16(%rdi) 2145; AVX1-NEXT: vmovd %xmm4, %eax 2146; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2147; AVX1-NEXT: movw %ax, 8(%rdi) 2148; AVX1-NEXT: vmovd %xmm4, %eax 2149; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2150; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2151; AVX1-NEXT: movw %ax, (%rdi) 2152; AVX1-NEXT: vmovd %xmm4, %eax 2153; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2154; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2155; AVX1-NEXT: movw %ax, 30(%rdi) 2156; AVX1-NEXT: vmovd %xmm4, %eax 2157; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2158; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2159; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2160; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2161; AVX1-NEXT: movw %ax, 28(%rdi) 2162; AVX1-NEXT: vmovd %xmm3, %eax 2163; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2164; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2165; AVX1-NEXT: movw %ax, 26(%rdi) 2166; AVX1-NEXT: vmovd %xmm3, %eax 2167; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2168; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2169; AVX1-NEXT: movw %ax, 22(%rdi) 2170; AVX1-NEXT: vmovd %xmm3, %eax 2171; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2172; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2173; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2174; AVX1-NEXT: vcvtps2ph 
$4, %xmm0, %xmm0 2175; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2176; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2177; AVX1-NEXT: movw %ax, 20(%rdi) 2178; AVX1-NEXT: vmovd %xmm1, %eax 2179; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2180; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2181; AVX1-NEXT: movw %ax, 18(%rdi) 2182; AVX1-NEXT: vmovd %xmm1, %eax 2183; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2184; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2185; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2186; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2187; AVX1-NEXT: movw %ax, 14(%rdi) 2188; AVX1-NEXT: vmovd %xmm2, %eax 2189; AVX1-NEXT: movw %ax, 12(%rdi) 2190; AVX1-NEXT: vmovd %xmm1, %eax 2191; AVX1-NEXT: movw %ax, 10(%rdi) 2192; AVX1-NEXT: vmovd %xmm0, %eax 2193; AVX1-NEXT: movw %ax, 6(%rdi) 2194; AVX1-NEXT: vmovd %xmm3, %eax 2195; AVX1-NEXT: movw %ax, 4(%rdi) 2196; AVX1-NEXT: vmovd %xmm4, %eax 2197; AVX1-NEXT: movw %ax, 2(%rdi) 2198; AVX1-NEXT: vzeroupper 2199; AVX1-NEXT: retq 2200; 2201; AVX2-LABEL: store_cvt_16f32_to_16i16: 2202; AVX2: # BB#0: 2203; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 2204; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 2205; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2206; AVX2-NEXT: vmovd %xmm4, %eax 2207; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2208; AVX2-NEXT: movw %ax, 24(%rdi) 2209; AVX2-NEXT: vmovd %xmm4, %eax 2210; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2211; AVX2-NEXT: movw %ax, 16(%rdi) 2212; AVX2-NEXT: vmovd %xmm4, %eax 2213; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2214; AVX2-NEXT: movw %ax, 8(%rdi) 2215; AVX2-NEXT: vmovd %xmm4, %eax 2216; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2217; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2218; AVX2-NEXT: movw %ax, (%rdi) 2219; AVX2-NEXT: vmovd %xmm4, %eax 2220; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2221; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2222; AVX2-NEXT: movw %ax, 30(%rdi) 2223; AVX2-NEXT: vmovd %xmm4, %eax 2224; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2225; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2226; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2227; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2228; AVX2-NEXT: movw %ax, 28(%rdi) 2229; AVX2-NEXT: vmovd %xmm3, %eax 2230; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] 2231; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2232; AVX2-NEXT: movw %ax, 26(%rdi) 2233; AVX2-NEXT: vmovd %xmm3, %eax 2234; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] 2235; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2236; AVX2-NEXT: movw %ax, 22(%rdi) 2237; AVX2-NEXT: vmovd %xmm3, %eax 2238; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2239; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2240; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2241; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2242; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 2243; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2244; AVX2-NEXT: movw %ax, 20(%rdi) 2245; AVX2-NEXT: vmovd %xmm1, %eax 2246; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3] 2247; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2248; AVX2-NEXT: movw %ax, 18(%rdi) 2249; AVX2-NEXT: vmovd %xmm1, %eax 2250; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 2251; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2252; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] 2253; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2254; AVX2-NEXT: movw %ax, 14(%rdi) 2255; AVX2-NEXT: vmovd %xmm2, %eax 2256; AVX2-NEXT: movw %ax, 12(%rdi) 2257; AVX2-NEXT: vmovd %xmm1, %eax 2258; AVX2-NEXT: movw %ax, 10(%rdi) 2259; AVX2-NEXT: vmovd %xmm0, %eax 2260; AVX2-NEXT: movw %ax, 6(%rdi) 2261; AVX2-NEXT: vmovd %xmm3, 
%eax 2262; AVX2-NEXT: movw %ax, 4(%rdi) 2263; AVX2-NEXT: vmovd %xmm4, %eax 2264; AVX2-NEXT: movw %ax, 2(%rdi) 2265; AVX2-NEXT: vzeroupper 2266; AVX2-NEXT: retq 2267; 2268; AVX512-LABEL: store_cvt_16f32_to_16i16: 2269; AVX512: # BB#0: 2270; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 2271; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2 2272; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3 2273; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4 2274; AVX512-NEXT: vmovd %xmm4, %eax 2275; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4 2276; AVX512-NEXT: movw %ax, 24(%rdi) 2277; AVX512-NEXT: vmovd %xmm4, %eax 2278; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4 2279; AVX512-NEXT: movw %ax, 16(%rdi) 2280; AVX512-NEXT: vmovd %xmm4, %eax 2281; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4 2282; AVX512-NEXT: movw %ax, 8(%rdi) 2283; AVX512-NEXT: vmovd %xmm4, %eax 2284; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] 2285; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2286; AVX512-NEXT: movw %ax, (%rdi) 2287; AVX512-NEXT: vmovd %xmm4, %eax 2288; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] 2289; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2290; AVX512-NEXT: movw %ax, 30(%rdi) 2291; AVX512-NEXT: vmovd %xmm4, %eax 2292; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2293; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2294; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] 2295; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2296; AVX512-NEXT: movw %ax, 28(%rdi) 2297; AVX512-NEXT: vmovd %xmm3, %eax 2298; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3] 2299; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2300; AVX512-NEXT: movw %ax, 26(%rdi) 2301; AVX512-NEXT: vmovd %xmm3, %eax 2302; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 2303; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2304; AVX512-NEXT: movw %ax, 22(%rdi) 2305; AVX512-NEXT: vmovd %xmm3, %eax 2306; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] 2307; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 2308; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] 2309; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2310; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] 2311; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2312; AVX512-NEXT: movw %ax, 20(%rdi) 2313; AVX512-NEXT: vmovd %xmm2, %eax 2314; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] 2315; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2316; AVX512-NEXT: movw %ax, 18(%rdi) 2317; AVX512-NEXT: vmovd %xmm2, %eax 2318; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 2319; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2320; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 2321; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2322; AVX512-NEXT: movw %ax, 14(%rdi) 2323; AVX512-NEXT: vmovd %xmm1, %eax 2324; AVX512-NEXT: movw %ax, 12(%rdi) 2325; AVX512-NEXT: vmovd %xmm2, %eax 2326; AVX512-NEXT: movw %ax, 10(%rdi) 2327; AVX512-NEXT: vmovd %xmm0, %eax 2328; AVX512-NEXT: movw %ax, 6(%rdi) 2329; AVX512-NEXT: vmovd %xmm3, %eax 2330; AVX512-NEXT: movw %ax, 4(%rdi) 2331; AVX512-NEXT: vmovd %xmm4, %eax 2332; AVX512-NEXT: movw %ax, 2(%rdi) 2333; AVX512-NEXT: retq 2334 %1 = fptrunc <16 x float> %a0 to <16 x half> 2335 %2 = bitcast <16 x half> %1 to <16 x i16> 2336 store <16 x i16> %2, <16 x i16>* %a1 2337 ret void 2338} 2339 2340; 2341; Double to Half 2342; 2343 2344define i16 @cvt_f64_to_i16(double %a0) { 2345; ALL-LABEL: cvt_f64_to_i16: 2346; ALL: # BB#0: 2347; ALL-NEXT: jmp __truncdfhf2 # TAILCALL 2348 %1 = fptrunc double %a0 to half 2349 %2 = bitcast half %1 to i16 2350 ret i16 %2 2351} 2352 2353define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) { 2354; ALL-LABEL: 
cvt_2f64_to_2i16: 2355; ALL: # BB#0: 2356; ALL-NEXT: pushq %rbx 2357; ALL-NEXT: .Ltmp0: 2358; ALL-NEXT: .cfi_def_cfa_offset 16 2359; ALL-NEXT: subq $16, %rsp 2360; ALL-NEXT: .Ltmp1: 2361; ALL-NEXT: .cfi_def_cfa_offset 32 2362; ALL-NEXT: .Ltmp2: 2363; ALL-NEXT: .cfi_offset %rbx, -16 2364; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2365; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2366; ALL-NEXT: callq __truncdfhf2 2367; ALL-NEXT: movw %ax, %bx 2368; ALL-NEXT: shll $16, %ebx 2369; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2370; ALL-NEXT: callq __truncdfhf2 2371; ALL-NEXT: movzwl %ax, %eax 2372; ALL-NEXT: orl %ebx, %eax 2373; ALL-NEXT: vmovd %eax, %xmm0 2374; ALL-NEXT: addq $16, %rsp 2375; ALL-NEXT: popq %rbx 2376; ALL-NEXT: retq 2377 %1 = fptrunc <2 x double> %a0 to <2 x half> 2378 %2 = bitcast <2 x half> %1 to <2 x i16> 2379 ret <2 x i16> %2 2380} 2381 2382define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) { 2383; AVX1-LABEL: cvt_4f64_to_4i16: 2384; AVX1: # BB#0: 2385; AVX1-NEXT: pushq %r14 2386; AVX1-NEXT: .Ltmp3: 2387; AVX1-NEXT: .cfi_def_cfa_offset 16 2388; AVX1-NEXT: pushq %rbx 2389; AVX1-NEXT: .Ltmp4: 2390; AVX1-NEXT: .cfi_def_cfa_offset 24 2391; AVX1-NEXT: subq $40, %rsp 2392; AVX1-NEXT: .Ltmp5: 2393; AVX1-NEXT: .cfi_def_cfa_offset 64 2394; AVX1-NEXT: .Ltmp6: 2395; AVX1-NEXT: .cfi_offset %rbx, -24 2396; AVX1-NEXT: .Ltmp7: 2397; AVX1-NEXT: .cfi_offset %r14, -16 2398; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2399; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2400; AVX1-NEXT: vzeroupper 2401; AVX1-NEXT: callq __truncdfhf2 2402; AVX1-NEXT: movw %ax, %bx 2403; AVX1-NEXT: shll $16, %ebx 2404; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2405; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2406; AVX1-NEXT: vzeroupper 2407; AVX1-NEXT: callq __truncdfhf2 2408; AVX1-NEXT: movzwl %ax, %r14d 2409; AVX1-NEXT: orl %ebx, %r14d 2410; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2411; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2412; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2413; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2414; AVX1-NEXT: vzeroupper 2415; AVX1-NEXT: callq __truncdfhf2 2416; AVX1-NEXT: movw %ax, %bx 2417; AVX1-NEXT: shll $16, %ebx 2418; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2419; AVX1-NEXT: callq __truncdfhf2 2420; AVX1-NEXT: movzwl %ax, %eax 2421; AVX1-NEXT: orl %ebx, %eax 2422; AVX1-NEXT: shlq $32, %rax 2423; AVX1-NEXT: orq %r14, %rax 2424; AVX1-NEXT: vmovq %rax, %xmm0 2425; AVX1-NEXT: addq $40, %rsp 2426; AVX1-NEXT: popq %rbx 2427; AVX1-NEXT: popq %r14 2428; AVX1-NEXT: retq 2429; 2430; AVX2-LABEL: cvt_4f64_to_4i16: 2431; AVX2: # BB#0: 2432; AVX2-NEXT: pushq %r14 2433; AVX2-NEXT: .Ltmp3: 2434; AVX2-NEXT: .cfi_def_cfa_offset 16 2435; AVX2-NEXT: pushq %rbx 2436; AVX2-NEXT: .Ltmp4: 2437; AVX2-NEXT: .cfi_def_cfa_offset 24 2438; AVX2-NEXT: subq $40, %rsp 2439; AVX2-NEXT: .Ltmp5: 2440; AVX2-NEXT: .cfi_def_cfa_offset 64 2441; AVX2-NEXT: .Ltmp6: 2442; AVX2-NEXT: .cfi_offset %rbx, -24 2443; AVX2-NEXT: .Ltmp7: 2444; AVX2-NEXT: .cfi_offset %r14, -16 2445; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2446; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2447; AVX2-NEXT: vzeroupper 2448; AVX2-NEXT: callq __truncdfhf2 2449; AVX2-NEXT: movw %ax, %bx 2450; AVX2-NEXT: shll $16, %ebx 2451; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2452; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2453; AVX2-NEXT: vzeroupper 2454; AVX2-NEXT: callq __truncdfhf2 2455; AVX2-NEXT: movzwl %ax, %r14d 2456; AVX2-NEXT: orl %ebx, %r14d 2457; 
AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2458; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2459; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2460; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2461; AVX2-NEXT: vzeroupper 2462; AVX2-NEXT: callq __truncdfhf2 2463; AVX2-NEXT: movw %ax, %bx 2464; AVX2-NEXT: shll $16, %ebx 2465; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2466; AVX2-NEXT: callq __truncdfhf2 2467; AVX2-NEXT: movzwl %ax, %eax 2468; AVX2-NEXT: orl %ebx, %eax 2469; AVX2-NEXT: shlq $32, %rax 2470; AVX2-NEXT: orq %r14, %rax 2471; AVX2-NEXT: vmovq %rax, %xmm0 2472; AVX2-NEXT: addq $40, %rsp 2473; AVX2-NEXT: popq %rbx 2474; AVX2-NEXT: popq %r14 2475; AVX2-NEXT: retq 2476; 2477; AVX512-LABEL: cvt_4f64_to_4i16: 2478; AVX512: # BB#0: 2479; AVX512-NEXT: pushq %r14 2480; AVX512-NEXT: .Ltmp3: 2481; AVX512-NEXT: .cfi_def_cfa_offset 16 2482; AVX512-NEXT: pushq %rbx 2483; AVX512-NEXT: .Ltmp4: 2484; AVX512-NEXT: .cfi_def_cfa_offset 24 2485; AVX512-NEXT: subq $40, %rsp 2486; AVX512-NEXT: .Ltmp5: 2487; AVX512-NEXT: .cfi_def_cfa_offset 64 2488; AVX512-NEXT: .Ltmp6: 2489; AVX512-NEXT: .cfi_offset %rbx, -24 2490; AVX512-NEXT: .Ltmp7: 2491; AVX512-NEXT: .cfi_offset %r14, -16 2492; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2493; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2494; AVX512-NEXT: callq __truncdfhf2 2495; AVX512-NEXT: movw %ax, %bx 2496; AVX512-NEXT: shll $16, %ebx 2497; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2498; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2499; AVX512-NEXT: callq __truncdfhf2 2500; AVX512-NEXT: movzwl %ax, %r14d 2501; AVX512-NEXT: orl %ebx, %r14d 2502; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2503; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 2504; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2505; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2506; AVX512-NEXT: callq __truncdfhf2 2507; AVX512-NEXT: movw %ax, %bx 2508; AVX512-NEXT: shll $16, %ebx 2509; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2510; AVX512-NEXT: callq __truncdfhf2 2511; AVX512-NEXT: movzwl %ax, %eax 2512; AVX512-NEXT: orl %ebx, %eax 2513; AVX512-NEXT: shlq $32, %rax 2514; AVX512-NEXT: orq %r14, %rax 2515; AVX512-NEXT: vmovq %rax, %xmm0 2516; AVX512-NEXT: addq $40, %rsp 2517; AVX512-NEXT: popq %rbx 2518; AVX512-NEXT: popq %r14 2519; AVX512-NEXT: retq 2520 %1 = fptrunc <4 x double> %a0 to <4 x half> 2521 %2 = bitcast <4 x half> %1 to <4 x i16> 2522 ret <4 x i16> %2 2523} 2524 2525define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) { 2526; AVX1-LABEL: cvt_4f64_to_8i16_undef: 2527; AVX1: # BB#0: 2528; AVX1-NEXT: pushq %r14 2529; AVX1-NEXT: .Ltmp8: 2530; AVX1-NEXT: .cfi_def_cfa_offset 16 2531; AVX1-NEXT: pushq %rbx 2532; AVX1-NEXT: .Ltmp9: 2533; AVX1-NEXT: .cfi_def_cfa_offset 24 2534; AVX1-NEXT: subq $40, %rsp 2535; AVX1-NEXT: .Ltmp10: 2536; AVX1-NEXT: .cfi_def_cfa_offset 64 2537; AVX1-NEXT: .Ltmp11: 2538; AVX1-NEXT: .cfi_offset %rbx, -24 2539; AVX1-NEXT: .Ltmp12: 2540; AVX1-NEXT: .cfi_offset %r14, -16 2541; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2542; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2543; AVX1-NEXT: vzeroupper 2544; AVX1-NEXT: callq __truncdfhf2 2545; AVX1-NEXT: movw %ax, %bx 2546; AVX1-NEXT: shll $16, %ebx 2547; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2548; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2549; AVX1-NEXT: vzeroupper 2550; AVX1-NEXT: callq __truncdfhf2 2551; AVX1-NEXT: movzwl %ax, %r14d 2552; AVX1-NEXT: orl %ebx, %r14d 2553; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 
32-byte Reload 2554; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2555; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2556; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2557; AVX1-NEXT: vzeroupper 2558; AVX1-NEXT: callq __truncdfhf2 2559; AVX1-NEXT: movw %ax, %bx 2560; AVX1-NEXT: shll $16, %ebx 2561; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2562; AVX1-NEXT: callq __truncdfhf2 2563; AVX1-NEXT: movzwl %ax, %eax 2564; AVX1-NEXT: orl %ebx, %eax 2565; AVX1-NEXT: shlq $32, %rax 2566; AVX1-NEXT: orq %r14, %rax 2567; AVX1-NEXT: vmovq %rax, %xmm0 2568; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2569; AVX1-NEXT: addq $40, %rsp 2570; AVX1-NEXT: popq %rbx 2571; AVX1-NEXT: popq %r14 2572; AVX1-NEXT: retq 2573; 2574; AVX2-LABEL: cvt_4f64_to_8i16_undef: 2575; AVX2: # BB#0: 2576; AVX2-NEXT: pushq %r14 2577; AVX2-NEXT: .Ltmp8: 2578; AVX2-NEXT: .cfi_def_cfa_offset 16 2579; AVX2-NEXT: pushq %rbx 2580; AVX2-NEXT: .Ltmp9: 2581; AVX2-NEXT: .cfi_def_cfa_offset 24 2582; AVX2-NEXT: subq $40, %rsp 2583; AVX2-NEXT: .Ltmp10: 2584; AVX2-NEXT: .cfi_def_cfa_offset 64 2585; AVX2-NEXT: .Ltmp11: 2586; AVX2-NEXT: .cfi_offset %rbx, -24 2587; AVX2-NEXT: .Ltmp12: 2588; AVX2-NEXT: .cfi_offset %r14, -16 2589; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2590; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2591; AVX2-NEXT: vzeroupper 2592; AVX2-NEXT: callq __truncdfhf2 2593; AVX2-NEXT: movw %ax, %bx 2594; AVX2-NEXT: shll $16, %ebx 2595; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2596; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2597; AVX2-NEXT: vzeroupper 2598; AVX2-NEXT: callq __truncdfhf2 2599; AVX2-NEXT: movzwl %ax, %r14d 2600; AVX2-NEXT: orl %ebx, %r14d 2601; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2602; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2603; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2604; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2605; AVX2-NEXT: vzeroupper 2606; AVX2-NEXT: callq __truncdfhf2 2607; AVX2-NEXT: movw %ax, %bx 2608; AVX2-NEXT: shll $16, %ebx 2609; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2610; AVX2-NEXT: callq __truncdfhf2 2611; AVX2-NEXT: movzwl %ax, %eax 2612; AVX2-NEXT: orl %ebx, %eax 2613; AVX2-NEXT: shlq $32, %rax 2614; AVX2-NEXT: orq %r14, %rax 2615; AVX2-NEXT: vmovq %rax, %xmm0 2616; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2617; AVX2-NEXT: addq $40, %rsp 2618; AVX2-NEXT: popq %rbx 2619; AVX2-NEXT: popq %r14 2620; AVX2-NEXT: retq 2621; 2622; AVX512-LABEL: cvt_4f64_to_8i16_undef: 2623; AVX512: # BB#0: 2624; AVX512-NEXT: pushq %r14 2625; AVX512-NEXT: .Ltmp8: 2626; AVX512-NEXT: .cfi_def_cfa_offset 16 2627; AVX512-NEXT: pushq %rbx 2628; AVX512-NEXT: .Ltmp9: 2629; AVX512-NEXT: .cfi_def_cfa_offset 24 2630; AVX512-NEXT: subq $40, %rsp 2631; AVX512-NEXT: .Ltmp10: 2632; AVX512-NEXT: .cfi_def_cfa_offset 64 2633; AVX512-NEXT: .Ltmp11: 2634; AVX512-NEXT: .cfi_offset %rbx, -24 2635; AVX512-NEXT: .Ltmp12: 2636; AVX512-NEXT: .cfi_offset %r14, -16 2637; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2638; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2639; AVX512-NEXT: callq __truncdfhf2 2640; AVX512-NEXT: movw %ax, %bx 2641; AVX512-NEXT: shll $16, %ebx 2642; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2643; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2644; AVX512-NEXT: callq __truncdfhf2 2645; AVX512-NEXT: movzwl %ax, %r14d 2646; AVX512-NEXT: orl %ebx, %r14d 2647; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2648; AVX512-NEXT: vextractf128 
$1, %ymm0, %xmm0 2649; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2650; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2651; AVX512-NEXT: callq __truncdfhf2 2652; AVX512-NEXT: movw %ax, %bx 2653; AVX512-NEXT: shll $16, %ebx 2654; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2655; AVX512-NEXT: callq __truncdfhf2 2656; AVX512-NEXT: movzwl %ax, %eax 2657; AVX512-NEXT: orl %ebx, %eax 2658; AVX512-NEXT: shlq $32, %rax 2659; AVX512-NEXT: orq %r14, %rax 2660; AVX512-NEXT: vmovq %rax, %xmm0 2661; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2662; AVX512-NEXT: addq $40, %rsp 2663; AVX512-NEXT: popq %rbx 2664; AVX512-NEXT: popq %r14 2665; AVX512-NEXT: retq 2666 %1 = fptrunc <4 x double> %a0 to <4 x half> 2667 %2 = bitcast <4 x half> %1 to <4 x i16> 2668 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2669 ret <8 x i16> %3 2670} 2671 2672define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) { 2673; AVX1-LABEL: cvt_4f64_to_8i16_zero: 2674; AVX1: # BB#0: 2675; AVX1-NEXT: pushq %r14 2676; AVX1-NEXT: .Ltmp13: 2677; AVX1-NEXT: .cfi_def_cfa_offset 16 2678; AVX1-NEXT: pushq %rbx 2679; AVX1-NEXT: .Ltmp14: 2680; AVX1-NEXT: .cfi_def_cfa_offset 24 2681; AVX1-NEXT: subq $40, %rsp 2682; AVX1-NEXT: .Ltmp15: 2683; AVX1-NEXT: .cfi_def_cfa_offset 64 2684; AVX1-NEXT: .Ltmp16: 2685; AVX1-NEXT: .cfi_offset %rbx, -24 2686; AVX1-NEXT: .Ltmp17: 2687; AVX1-NEXT: .cfi_offset %r14, -16 2688; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2689; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2690; AVX1-NEXT: vzeroupper 2691; AVX1-NEXT: callq __truncdfhf2 2692; AVX1-NEXT: movw %ax, %bx 2693; AVX1-NEXT: shll $16, %ebx 2694; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2695; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2696; AVX1-NEXT: vzeroupper 2697; AVX1-NEXT: callq __truncdfhf2 2698; AVX1-NEXT: movzwl %ax, %r14d 2699; AVX1-NEXT: orl %ebx, %r14d 2700; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2701; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2702; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2703; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2704; AVX1-NEXT: vzeroupper 2705; AVX1-NEXT: callq __truncdfhf2 2706; AVX1-NEXT: movw %ax, %bx 2707; AVX1-NEXT: shll $16, %ebx 2708; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2709; AVX1-NEXT: callq __truncdfhf2 2710; AVX1-NEXT: movzwl %ax, %eax 2711; AVX1-NEXT: orl %ebx, %eax 2712; AVX1-NEXT: shlq $32, %rax 2713; AVX1-NEXT: orq %r14, %rax 2714; AVX1-NEXT: vmovq %rax, %xmm0 2715; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2716; AVX1-NEXT: addq $40, %rsp 2717; AVX1-NEXT: popq %rbx 2718; AVX1-NEXT: popq %r14 2719; AVX1-NEXT: retq 2720; 2721; AVX2-LABEL: cvt_4f64_to_8i16_zero: 2722; AVX2: # BB#0: 2723; AVX2-NEXT: pushq %r14 2724; AVX2-NEXT: .Ltmp13: 2725; AVX2-NEXT: .cfi_def_cfa_offset 16 2726; AVX2-NEXT: pushq %rbx 2727; AVX2-NEXT: .Ltmp14: 2728; AVX2-NEXT: .cfi_def_cfa_offset 24 2729; AVX2-NEXT: subq $40, %rsp 2730; AVX2-NEXT: .Ltmp15: 2731; AVX2-NEXT: .cfi_def_cfa_offset 64 2732; AVX2-NEXT: .Ltmp16: 2733; AVX2-NEXT: .cfi_offset %rbx, -24 2734; AVX2-NEXT: .Ltmp17: 2735; AVX2-NEXT: .cfi_offset %r14, -16 2736; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2737; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2738; AVX2-NEXT: vzeroupper 2739; AVX2-NEXT: callq __truncdfhf2 2740; AVX2-NEXT: movw %ax, %bx 2741; AVX2-NEXT: shll $16, %ebx 2742; AVX2-NEXT: vmovups (%rsp), %ymm0 # 
32-byte Reload 2743; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2744; AVX2-NEXT: vzeroupper 2745; AVX2-NEXT: callq __truncdfhf2 2746; AVX2-NEXT: movzwl %ax, %r14d 2747; AVX2-NEXT: orl %ebx, %r14d 2748; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2749; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 2750; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2751; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2752; AVX2-NEXT: vzeroupper 2753; AVX2-NEXT: callq __truncdfhf2 2754; AVX2-NEXT: movw %ax, %bx 2755; AVX2-NEXT: shll $16, %ebx 2756; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2757; AVX2-NEXT: callq __truncdfhf2 2758; AVX2-NEXT: movzwl %ax, %eax 2759; AVX2-NEXT: orl %ebx, %eax 2760; AVX2-NEXT: shlq $32, %rax 2761; AVX2-NEXT: orq %r14, %rax 2762; AVX2-NEXT: vmovq %rax, %xmm0 2763; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2764; AVX2-NEXT: addq $40, %rsp 2765; AVX2-NEXT: popq %rbx 2766; AVX2-NEXT: popq %r14 2767; AVX2-NEXT: retq 2768; 2769; AVX512-LABEL: cvt_4f64_to_8i16_zero: 2770; AVX512: # BB#0: 2771; AVX512-NEXT: pushq %r14 2772; AVX512-NEXT: .Ltmp13: 2773; AVX512-NEXT: .cfi_def_cfa_offset 16 2774; AVX512-NEXT: pushq %rbx 2775; AVX512-NEXT: .Ltmp14: 2776; AVX512-NEXT: .cfi_def_cfa_offset 24 2777; AVX512-NEXT: subq $40, %rsp 2778; AVX512-NEXT: .Ltmp15: 2779; AVX512-NEXT: .cfi_def_cfa_offset 64 2780; AVX512-NEXT: .Ltmp16: 2781; AVX512-NEXT: .cfi_offset %rbx, -24 2782; AVX512-NEXT: .Ltmp17: 2783; AVX512-NEXT: .cfi_offset %r14, -16 2784; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill 2785; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2786; AVX512-NEXT: callq __truncdfhf2 2787; AVX512-NEXT: movw %ax, %bx 2788; AVX512-NEXT: shll $16, %ebx 2789; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2790; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2791; AVX512-NEXT: callq __truncdfhf2 2792; AVX512-NEXT: movzwl %ax, %r14d 2793; AVX512-NEXT: orl %ebx, %r14d 2794; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2795; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 2796; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2797; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2798; AVX512-NEXT: callq __truncdfhf2 2799; AVX512-NEXT: movw %ax, %bx 2800; AVX512-NEXT: shll $16, %ebx 2801; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2802; AVX512-NEXT: callq __truncdfhf2 2803; AVX512-NEXT: movzwl %ax, %eax 2804; AVX512-NEXT: orl %ebx, %eax 2805; AVX512-NEXT: shlq $32, %rax 2806; AVX512-NEXT: orq %r14, %rax 2807; AVX512-NEXT: vmovq %rax, %xmm0 2808; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero 2809; AVX512-NEXT: addq $40, %rsp 2810; AVX512-NEXT: popq %rbx 2811; AVX512-NEXT: popq %r14 2812; AVX512-NEXT: retq 2813 %1 = fptrunc <4 x double> %a0 to <4 x half> 2814 %2 = bitcast <4 x half> %1 to <4 x i16> 2815 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2816 ret <8 x i16> %3 2817} 2818 2819define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { 2820; AVX1-LABEL: cvt_8f64_to_8i16: 2821; AVX1: # BB#0: 2822; AVX1-NEXT: pushq %r15 2823; AVX1-NEXT: .Ltmp18: 2824; AVX1-NEXT: .cfi_def_cfa_offset 16 2825; AVX1-NEXT: pushq %r14 2826; AVX1-NEXT: .Ltmp19: 2827; AVX1-NEXT: .cfi_def_cfa_offset 24 2828; AVX1-NEXT: pushq %rbx 2829; AVX1-NEXT: .Ltmp20: 2830; AVX1-NEXT: .cfi_def_cfa_offset 32 2831; AVX1-NEXT: subq $64, %rsp 2832; AVX1-NEXT: .Ltmp21: 2833; AVX1-NEXT: .cfi_def_cfa_offset 96 2834; 
AVX1-NEXT: .Ltmp22: 2835; AVX1-NEXT: .cfi_offset %rbx, -32 2836; AVX1-NEXT: .Ltmp23: 2837; AVX1-NEXT: .cfi_offset %r14, -24 2838; AVX1-NEXT: .Ltmp24: 2839; AVX1-NEXT: .cfi_offset %r15, -16 2840; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 2841; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 2842; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2843; AVX1-NEXT: vzeroupper 2844; AVX1-NEXT: callq __truncdfhf2 2845; AVX1-NEXT: movw %ax, %bx 2846; AVX1-NEXT: shll $16, %ebx 2847; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 2848; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2849; AVX1-NEXT: vzeroupper 2850; AVX1-NEXT: callq __truncdfhf2 2851; AVX1-NEXT: movzwl %ax, %r15d 2852; AVX1-NEXT: orl %ebx, %r15d 2853; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload 2854; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2855; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill 2856; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2857; AVX1-NEXT: vzeroupper 2858; AVX1-NEXT: callq __truncdfhf2 2859; AVX1-NEXT: movw %ax, %bx 2860; AVX1-NEXT: shll $16, %ebx 2861; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload 2862; AVX1-NEXT: callq __truncdfhf2 2863; AVX1-NEXT: movzwl %ax, %r14d 2864; AVX1-NEXT: orl %ebx, %r14d 2865; AVX1-NEXT: shlq $32, %r14 2866; AVX1-NEXT: orq %r15, %r14 2867; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2868; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2869; AVX1-NEXT: vzeroupper 2870; AVX1-NEXT: callq __truncdfhf2 2871; AVX1-NEXT: movw %ax, %bx 2872; AVX1-NEXT: shll $16, %ebx 2873; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 2874; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> 2875; AVX1-NEXT: vzeroupper 2876; AVX1-NEXT: callq __truncdfhf2 2877; AVX1-NEXT: movzwl %ax, %r15d 2878; AVX1-NEXT: orl %ebx, %r15d 2879; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload 2880; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2881; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 2882; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2883; AVX1-NEXT: vzeroupper 2884; AVX1-NEXT: callq __truncdfhf2 2885; AVX1-NEXT: movw %ax, %bx 2886; AVX1-NEXT: shll $16, %ebx 2887; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 2888; AVX1-NEXT: callq __truncdfhf2 2889; AVX1-NEXT: movzwl %ax, %eax 2890; AVX1-NEXT: orl %ebx, %eax 2891; AVX1-NEXT: shlq $32, %rax 2892; AVX1-NEXT: orq %r15, %rax 2893; AVX1-NEXT: vmovq %rax, %xmm0 2894; AVX1-NEXT: vmovq %r14, %xmm1 2895; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2896; AVX1-NEXT: addq $64, %rsp 2897; AVX1-NEXT: popq %rbx 2898; AVX1-NEXT: popq %r14 2899; AVX1-NEXT: popq %r15 2900; AVX1-NEXT: retq 2901; 2902; AVX2-LABEL: cvt_8f64_to_8i16: 2903; AVX2: # BB#0: 2904; AVX2-NEXT: pushq %r15 2905; AVX2-NEXT: .Ltmp18: 2906; AVX2-NEXT: .cfi_def_cfa_offset 16 2907; AVX2-NEXT: pushq %r14 2908; AVX2-NEXT: .Ltmp19: 2909; AVX2-NEXT: .cfi_def_cfa_offset 24 2910; AVX2-NEXT: pushq %rbx 2911; AVX2-NEXT: .Ltmp20: 2912; AVX2-NEXT: .cfi_def_cfa_offset 32 2913; AVX2-NEXT: subq $64, %rsp 2914; AVX2-NEXT: .Ltmp21: 2915; AVX2-NEXT: .cfi_def_cfa_offset 96 2916; AVX2-NEXT: .Ltmp22: 2917; AVX2-NEXT: .cfi_offset %rbx, -32 2918; AVX2-NEXT: .Ltmp23: 2919; AVX2-NEXT: .cfi_offset %r14, -24 2920; AVX2-NEXT: .Ltmp24: 2921; AVX2-NEXT: .cfi_offset %r15, -16 2922; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill 2923; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill 2924; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 2925; AVX2-NEXT: vzeroupper 2926; AVX2-NEXT: callq __truncdfhf2 2927; AVX2-NEXT: movw %ax, %bx 
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r15d
; AVX2-NEXT: orl %ebx, %r15d
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r14d
; AVX2-NEXT: orl %ebx, %r14d
; AVX2-NEXT: shlq $32, %r14
; AVX2-NEXT: orq %r15, %r14
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r15d
; AVX2-NEXT: orl %ebx, %r15d
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %eax
; AVX2-NEXT: orl %ebx, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %r15, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vmovq %r14, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: addq $64, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_8f64_to_8i16:
; AVX512: # BB#0:
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: .Ltmp18:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp19:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp20:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: subq $96, %rsp
; AVX512-NEXT: .Ltmp21:
; AVX512-NEXT: .cfi_def_cfa_offset 128
; AVX512-NEXT: .Ltmp22:
; AVX512-NEXT: .cfi_offset %rbx, -32
; AVX512-NEXT: .Ltmp23:
; AVX512-NEXT: .cfi_offset %r14, -24
; AVX512-NEXT: .Ltmp24:
; AVX512-NEXT: .cfi_offset %r15, -16
; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %r15d
; AVX512-NEXT: orl %ebx, %r15d
; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %r14d
; AVX512-NEXT: orl %ebx, %r14d
; AVX512-NEXT: shlq $32, %r14
; AVX512-NEXT: orq %r15, %r14
; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %r15d
; AVX512-NEXT: orl %ebx, %r15d
; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: orl %ebx, %eax
; AVX512-NEXT: shlq $32, %rax
; AVX512-NEXT: orq %r15, %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vmovq %r14, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: addq $96, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  ret <8 x i16> %2
}

;
; Double to Half (Store)
;

define void @store_cvt_f64_to_i16(double %a0, i16* %a1) {
; ALL-LABEL: store_cvt_f64_to_i16:
; ALL: # BB#0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: .Ltmp25:
; ALL-NEXT: .cfi_def_cfa_offset 16
; ALL-NEXT: .Ltmp26:
; ALL-NEXT: .cfi_offset %rbx, -16
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, (%rbx)
; ALL-NEXT: popq %rbx
; ALL-NEXT: retq
  %1 = fptrunc double %a0 to half
  %2 = bitcast half %1 to i16
  store i16 %2, i16* %a1
  ret void
}

define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) {
; ALL-LABEL: store_cvt_2f64_to_2i16:
; ALL: # BB#0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: .Ltmp27:
; ALL-NEXT: .cfi_def_cfa_offset 16
; ALL-NEXT: pushq %rbx
; ALL-NEXT: .Ltmp28:
; ALL-NEXT: .cfi_def_cfa_offset 24
; ALL-NEXT: subq $24, %rsp
; ALL-NEXT: .Ltmp29:
; ALL-NEXT: .cfi_def_cfa_offset 48
; ALL-NEXT: .Ltmp30:
; ALL-NEXT: .cfi_offset %rbx, -24
; ALL-NEXT: .Ltmp31:
; ALL-NEXT: .cfi_offset %rbp, -16
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movl %eax, %ebp
; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, (%rbx)
; ALL-NEXT: movw %bp, 2(%rbx)
; ALL-NEXT: addq $24, %rsp
; ALL-NEXT: popq %rbx
; ALL-NEXT: popq %rbp
; ALL-NEXT: retq
  %1 = fptrunc <2 x double> %a0 to <2 x half>
  %2 = bitcast <2 x half> %1 to <2 x i16>
  store <2 x i16> %2, <2 x i16>* %a1
  ret void
}

define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) {
; AVX1-LABEL: store_cvt_4f64_to_4i16:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: .Ltmp32:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: .Ltmp33:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: .Ltmp34:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: .Ltmp35:
; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: subq $88, %rsp
; AVX1-NEXT: .Ltmp36:
; AVX1-NEXT: .cfi_def_cfa_offset 128
; AVX1-NEXT: .Ltmp37:
; AVX1-NEXT: .cfi_offset %rbx, -40
; AVX1-NEXT: .Ltmp38:
; AVX1-NEXT: .cfi_offset %r14, -32
; AVX1-NEXT: .Ltmp39:
; AVX1-NEXT: .cfi_offset %r15, -24
; AVX1-NEXT: .Ltmp40:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r14d
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, 4(%rbx)
; AVX1-NEXT: movw %bp, (%rbx)
; AVX1-NEXT: movw %r15w, 6(%rbx)
; AVX1-NEXT: movw %r14w, 2(%rbx)
; AVX1-NEXT: addq $88, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f64_to_4i16:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp32:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: .Ltmp33:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp34:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp35:
; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: subq $88, %rsp
; AVX2-NEXT: .Ltmp36:
; AVX2-NEXT: .cfi_def_cfa_offset 128
; AVX2-NEXT: .Ltmp37:
; AVX2-NEXT: .cfi_offset %rbx, -40
; AVX2-NEXT: .Ltmp38:
; AVX2-NEXT: .cfi_offset %r14, -32
; AVX2-NEXT: .Ltmp39:
; AVX2-NEXT: .cfi_offset %r15, -24
; AVX2-NEXT: .Ltmp40:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, 4(%rbx)
; AVX2-NEXT: movw %bp, (%rbx)
; AVX2-NEXT: movw %r15w, 6(%rbx)
; AVX2-NEXT: movw %r14w, 2(%rbx)
; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f64_to_4i16:
; AVX512: # BB#0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp32:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: .Ltmp33:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp34:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp35:
; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: subq $88, %rsp
; AVX512-NEXT: .Ltmp36:
; AVX512-NEXT: .cfi_def_cfa_offset 128
; AVX512-NEXT: .Ltmp37:
; AVX512-NEXT: .cfi_offset %rbx, -40
; AVX512-NEXT: .Ltmp38:
; AVX512-NEXT: .cfi_offset %r14, -32
; AVX512-NEXT: .Ltmp39:
; AVX512-NEXT: .cfi_offset %r15, -24
; AVX512-NEXT: .Ltmp40:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r14d
; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r15d
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, 4(%rbx)
; AVX512-NEXT: movw %bp, (%rbx)
; AVX512-NEXT: movw %r15w, 6(%rbx)
; AVX512-NEXT: movw %r14w, 2(%rbx)
; AVX512-NEXT: addq $88, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  store <4 x i16> %2, <4 x i16>* %a1
  ret void
}

define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) {
; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: .Ltmp41:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: .Ltmp42:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: .Ltmp43:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: subq $32, %rsp
; AVX1-NEXT: .Ltmp44:
; AVX1-NEXT: .cfi_def_cfa_offset 64
; AVX1-NEXT: .Ltmp45:
; AVX1-NEXT: .cfi_offset %rbx, -32
; AVX1-NEXT: .Ltmp46:
; AVX1-NEXT: .cfi_offset %r14, -24
; AVX1-NEXT: .Ltmp47:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rdi, %r14
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %ebx
; AVX1-NEXT: orl %ebp, %ebx
; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %eax
; AVX1-NEXT: orl %ebp, %eax
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %rbx, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovdqa %xmm0, (%r14)
; AVX1-NEXT: addq $32, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp41:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp42:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp43:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: .Ltmp44:
; AVX2-NEXT: .cfi_def_cfa_offset 64
; AVX2-NEXT: .Ltmp45:
; AVX2-NEXT: .cfi_offset %rbx, -32
; AVX2-NEXT: .Ltmp46:
; AVX2-NEXT: .cfi_offset %r14, -24
; AVX2-NEXT: .Ltmp47:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %r14
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %ebx
; AVX2-NEXT: orl %ebp, %ebx
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %eax
; AVX2-NEXT: orl %ebp, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rbx, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovdqa %xmm0, (%r14)
; AVX2-NEXT: addq $32, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
; AVX512: # BB#0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp41:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp42:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp43:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: subq $32, %rsp
; AVX512-NEXT: .Ltmp44:
; AVX512-NEXT: .cfi_def_cfa_offset 64
; AVX512-NEXT: .Ltmp45:
; AVX512-NEXT: .cfi_offset %rbx, -32
; AVX512-NEXT: .Ltmp46:
; AVX512-NEXT: .cfi_offset %r14, -24
; AVX512-NEXT: .Ltmp47:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %r14
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %ebx
; AVX512-NEXT: orl %ebp, %ebx
; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: orl %ebp, %eax
; AVX512-NEXT: shlq $32, %rax
; AVX512-NEXT: orq %rbx, %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vmovdqa %xmm0, (%r14)
; AVX512-NEXT: addq $32, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, <8 x i16>* %a1
  ret void
}

define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) {
; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: .Ltmp48:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: .Ltmp49:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: .Ltmp50:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: subq $32, %rsp
; AVX1-NEXT: .Ltmp51:
; AVX1-NEXT: .cfi_def_cfa_offset 64
; AVX1-NEXT: .Ltmp52:
; AVX1-NEXT: .cfi_offset %rbx, -32
; AVX1-NEXT: .Ltmp53:
; AVX1-NEXT: .cfi_offset %r14, -24
; AVX1-NEXT: .Ltmp54:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rdi, %r14
; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %ebx
; AVX1-NEXT: orl %ebp, %ebx
; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, %bp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %eax
; AVX1-NEXT: orl %ebp, %eax
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %rbx, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, (%r14)
; AVX1-NEXT: addq $32, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp48:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp49:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp50:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: .Ltmp51:
; AVX2-NEXT: .cfi_def_cfa_offset 64
; AVX2-NEXT: .Ltmp52:
; AVX2-NEXT: .cfi_offset %rbx, -32
; AVX2-NEXT: .Ltmp53:
; AVX2-NEXT: .cfi_offset %r14, -24
; AVX2-NEXT: .Ltmp54:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %r14
; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %ebx
; AVX2-NEXT: orl %ebp, %ebx
; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, %bp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %eax
; AVX2-NEXT: orl %ebp, %eax
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rbx, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vmovdqa %xmm0, (%r14)
; AVX2-NEXT: addq $32, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
; AVX512: # BB#0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp48:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp49:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp50:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: subq $32, %rsp
; AVX512-NEXT: .Ltmp51:
; AVX512-NEXT: .cfi_def_cfa_offset 64
; AVX512-NEXT: .Ltmp52:
; AVX512-NEXT: .cfi_offset %rbx, -32
; AVX512-NEXT: .Ltmp53:
; AVX512-NEXT: .cfi_offset %r14, -24
; AVX512-NEXT: .Ltmp54:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %r14
; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %ebx
; AVX512-NEXT: orl %ebp, %ebx
; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, %bp
; AVX512-NEXT: shll $16, %ebp
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: orl %ebp, %eax
; AVX512-NEXT: shlq $32, %rax
; AVX512-NEXT: orq %rbx, %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vmovdqa %xmm0, (%r14)
; AVX512-NEXT: addq $32, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = fptrunc <4 x double> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, <8 x i16>* %a1
  ret void
}

define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) {
; AVX1-LABEL: store_cvt_8f64_to_8i16:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: .Ltmp55:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: .Ltmp56:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: .Ltmp57:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: .Ltmp58:
; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: .Ltmp59:
; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: .Ltmp60:
; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: subq $136, %rsp
; AVX1-NEXT: .Ltmp61:
; AVX1-NEXT: .cfi_def_cfa_offset 192
; AVX1-NEXT: .Ltmp62:
; AVX1-NEXT: .cfi_offset %rbx, -56
; AVX1-NEXT: .Ltmp63:
; AVX1-NEXT: .cfi_offset %r12, -48
; AVX1-NEXT: .Ltmp64:
; AVX1-NEXT: .cfi_offset %r13, -40
; AVX1-NEXT: .Ltmp65:
; AVX1-NEXT: .cfi_offset %r14, -32
; AVX1-NEXT: .Ltmp66:
; AVX1-NEXT: .cfi_offset %r15, -24
; AVX1-NEXT: .Ltmp67:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r12d
; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r13d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r14d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movw %ax, 12(%rbx)
; AVX1-NEXT: movw %r15w, 8(%rbx)
; AVX1-NEXT: movw %r14w, 4(%rbx)
; AVX1-NEXT: movw %bp, (%rbx)
; AVX1-NEXT: movw %r13w, 14(%rbx)
; AVX1-NEXT: movw %r12w, 10(%rbx)
; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 6(%rbx)
; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX1-NEXT: movw %ax, 2(%rbx)
; AVX1-NEXT: addq $136, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_8f64_to_8i16:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .Ltmp55:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: .Ltmp56:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: .Ltmp57:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: .Ltmp58:
; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: .Ltmp59:
; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: .Ltmp60:
; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: subq $136, %rsp
; AVX2-NEXT: .Ltmp61:
; AVX2-NEXT: .cfi_def_cfa_offset 192
; AVX2-NEXT: .Ltmp62:
; AVX2-NEXT: .cfi_offset %rbx, -56
; AVX2-NEXT: .Ltmp63:
; AVX2-NEXT: .cfi_offset %r12, -48
; AVX2-NEXT: .Ltmp64:
; AVX2-NEXT: .cfi_offset %r13, -40
; AVX2-NEXT: .Ltmp65:
; AVX2-NEXT: .cfi_offset %r14, -32
; AVX2-NEXT: .Ltmp66:
; AVX2-NEXT: .cfi_offset %r15, -24
; AVX2-NEXT: .Ltmp67:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r12d
; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r13d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movw %ax, 12(%rbx)
; AVX2-NEXT: movw %r15w, 8(%rbx)
; AVX2-NEXT: movw %r14w, 4(%rbx)
; AVX2-NEXT: movw %bp, (%rbx)
; AVX2-NEXT: movw %r13w, 14(%rbx)
; AVX2-NEXT: movw %r12w, 10(%rbx)
; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 6(%rbx)
; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX2-NEXT: movw %ax, 2(%rbx)
; AVX2-NEXT: addq $136, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_8f64_to_8i16:
; AVX512: # BB#0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .Ltmp55:
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: .Ltmp56:
; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: .Ltmp57:
; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: .Ltmp58:
; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: .Ltmp59:
; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: .Ltmp60:
; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: subq $200, %rsp
; AVX512-NEXT: .Ltmp61:
; AVX512-NEXT: .cfi_def_cfa_offset 256
; AVX512-NEXT: .Ltmp62:
; AVX512-NEXT: .cfi_offset %rbx, -56
; AVX512-NEXT: .Ltmp63:
; AVX512-NEXT: .cfi_offset %r12, -48
; AVX512-NEXT: .Ltmp64:
; AVX512-NEXT: .cfi_offset %r13, -40
; AVX512-NEXT: .Ltmp65:
; AVX512-NEXT: .cfi_offset %r14, -32
; AVX512-NEXT: .Ltmp66:
; AVX512-NEXT: .cfi_offset %r15, -24
; AVX512-NEXT: .Ltmp67:
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r12d
; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r13d
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %ebp
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r14d
; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movl %eax, %r15d
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: movw %ax, 12(%rbx)
; AVX512-NEXT: movw %r15w, 8(%rbx)
; AVX512-NEXT: movw %r14w, 4(%rbx)
; AVX512-NEXT: movw %bp, (%rbx)
; AVX512-NEXT: movw %r13w, 14(%rbx)
; AVX512-NEXT: movw %r12w, 10(%rbx)
; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 6(%rbx)
; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
; AVX512-NEXT: movw %ax, 2(%rbx)
; AVX512-NEXT: addq $200, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = fptrunc <8 x double> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* %a1
  ret void
}