1; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 4; 5; 32-bit tests to make sure we're not doing anything stupid. 6; RUN: llc < %s -mtriple=i686-unknown-unknown 7; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse 8; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 9 10; 11; Signed Integer to Double 12; 13 14define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) { 15; SSE-LABEL: sitofp_2i64_to_2f64: 16; SSE: # BB#0: 17; SSE-NEXT: movd %xmm0, %rax 18; SSE-NEXT: cvtsi2sdq %rax, %xmm1 19; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 20; SSE-NEXT: movd %xmm0, %rax 21; SSE-NEXT: xorps %xmm0, %xmm0 22; SSE-NEXT: cvtsi2sdq %rax, %xmm0 23; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 24; SSE-NEXT: movapd %xmm1, %xmm0 25; SSE-NEXT: retq 26; 27; AVX-LABEL: sitofp_2i64_to_2f64: 28; AVX: # BB#0: 29; AVX-NEXT: vpextrq $1, %xmm0, %rax 30; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 31; AVX-NEXT: vmovq %xmm0, %rax 32; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 33; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 34; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 35; AVX-NEXT: retq 36 %cvt = sitofp <2 x i64> %a to <2 x double> 37 ret <2 x double> %cvt 38} 39 40define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) { 41; SSE-LABEL: sitofp_2i32_to_2f64: 42; SSE: # BB#0: 43; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 44; SSE-NEXT: retq 45; 46; AVX-LABEL: sitofp_2i32_to_2f64: 47; AVX: # BB#0: 48; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 49; AVX-NEXT: retq 50 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 51 %cvt = sitofp <2 x i32> %shuf to <2 x double> 52 ret <2 x double> %cvt 53} 54 55define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) { 56; SSE-LABEL: sitofp_4i32_to_2f64: 57; SSE: # BB#0: 58; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 59; SSE-NEXT: retq 60; 61; AVX-LABEL: sitofp_4i32_to_2f64: 62; AVX: # BB#0: 63; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 64; AVX-NEXT: vzeroupper 65; AVX-NEXT: retq 66 %cvt = sitofp <4 x i32> %a to <4 x double> 67 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1> 68 ret <2 x double> %shuf 69} 70 71define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) { 72; SSE-LABEL: sitofp_2i16_to_2f64: 73; SSE: # BB#0: 74; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 75; SSE-NEXT: psrad $16, %xmm0 76; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 77; SSE-NEXT: retq 78; 79; AVX-LABEL: sitofp_2i16_to_2f64: 80; AVX: # BB#0: 81; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 82; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 83; AVX-NEXT: retq 84 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 85 %cvt = sitofp <2 x i16> %shuf to <2 x double> 86 ret <2 x double> %cvt 87} 88 89define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) { 90; SSE-LABEL: sitofp_8i16_to_2f64: 91; SSE: # BB#0: 92; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 93; SSE-NEXT: psrad $16, %xmm0 94; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 95; SSE-NEXT: retq 96; 97; AVX1-LABEL: sitofp_8i16_to_2f64: 98; AVX1: # BB#0: 99; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 100; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 101; AVX1-NEXT: vzeroupper 102; AVX1-NEXT: retq 103; 104; AVX2-LABEL: sitofp_8i16_to_2f64: 105; AVX2: # BB#0: 106; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 107; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 108; AVX2-NEXT: vzeroupper 109; AVX2-NEXT: retq 110 %cvt = sitofp <8 x i16> %a to <8 x double> 111 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1> 112 ret <2 x double> %shuf 113} 114 115define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) { 116; SSE-LABEL: sitofp_2i8_to_2f64: 117; SSE: # BB#0: 118; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 119; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 120; SSE-NEXT: psrad $24, %xmm0 121; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 122; SSE-NEXT: retq 123; 124; AVX-LABEL: sitofp_2i8_to_2f64: 125; AVX: # BB#0: 126; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 127; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 128; AVX-NEXT: retq 129 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1> 130 %cvt = sitofp <2 x i8> %shuf to <2 x double> 131 ret <2 x double> %cvt 132} 133 134define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { 135; SSE-LABEL: sitofp_16i8_to_2f64: 136; SSE: # BB#0: 137; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 138; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 139; SSE-NEXT: psrad $24, %xmm0 140; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 141; SSE-NEXT: retq 142; 143; AVX1-LABEL: sitofp_16i8_to_2f64: 144; AVX1: # BB#0: 145; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 146; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 147; AVX1-NEXT: vzeroupper 148; AVX1-NEXT: retq 149; 150; AVX2-LABEL: sitofp_16i8_to_2f64: 151; AVX2: # BB#0: 152; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 153; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 154; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 155; AVX2-NEXT: vzeroupper 156; AVX2-NEXT: retq 157 %cvt = sitofp <16 x i8> %a to <16 x double> 158 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1> 159 ret <2 x double> %shuf 160} 161 162define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { 163; SSE-LABEL: sitofp_4i64_to_4f64: 164; SSE: # BB#0: 165; SSE-NEXT: movd %xmm0, %rax 166; SSE-NEXT: cvtsi2sdq %rax, %xmm2 167; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 168; SSE-NEXT: movd %xmm0, %rax 169; SSE-NEXT: xorps %xmm0, %xmm0 170; SSE-NEXT: cvtsi2sdq %rax, %xmm0 171; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] 172; SSE-NEXT: movd %xmm1, %rax 173; SSE-NEXT: cvtsi2sdq %rax, %xmm3 174; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 175; SSE-NEXT: movd %xmm0, %rax 176; SSE-NEXT: xorps %xmm0, %xmm0 177; SSE-NEXT: cvtsi2sdq %rax, %xmm0 178; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] 179; SSE-NEXT: movapd %xmm2, %xmm0 180; SSE-NEXT: movapd %xmm3, %xmm1 181; SSE-NEXT: retq 182; 183; AVX1-LABEL: sitofp_4i64_to_4f64: 184; AVX1: # BB#0: 185; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 186; AVX1-NEXT: vpextrq $1, %xmm1, %rax 187; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 188; AVX1-NEXT: vmovq %xmm1, %rax 189; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 190; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 191; AVX1-NEXT: vpextrq $1, %xmm0, %rax 192; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 193; AVX1-NEXT: vmovq %xmm0, %rax 194; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 195; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 196; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] 197; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 198; AVX1-NEXT: retq 199; 200; AVX2-LABEL: sitofp_4i64_to_4f64: 201; AVX2: # BB#0: 202; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 203; AVX2-NEXT: vpextrq $1, %xmm1, %rax 204; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 205; AVX2-NEXT: vmovq %xmm1, %rax 206; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 207; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 208; AVX2-NEXT: vpextrq $1, %xmm0, %rax 209; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 210; AVX2-NEXT: vmovq %xmm0, %rax 211; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 212; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 213; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] 214; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 215; AVX2-NEXT: retq 216 %cvt = sitofp <4 x i64> %a to <4 x double> 217 ret <4 x double> %cvt 218} 219 220define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) { 221; SSE-LABEL: sitofp_4i32_to_4f64: 222; SSE: # BB#0: 223; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 224; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 225; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 226; SSE-NEXT: movaps %xmm2, %xmm0 227; SSE-NEXT: retq 228; 229; AVX-LABEL: sitofp_4i32_to_4f64: 230; AVX: # BB#0: 231; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 232; AVX-NEXT: retq 233 %cvt = sitofp <4 x i32> %a to <4 x double> 234 ret <4 x double> %cvt 235} 236 237define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) { 238; SSE-LABEL: sitofp_4i16_to_4f64: 239; SSE: # BB#0: 240; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 241; SSE-NEXT: psrad $16, %xmm1 242; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 243; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 244; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 245; SSE-NEXT: retq 246; 247; AVX-LABEL: sitofp_4i16_to_4f64: 248; AVX: # BB#0: 249; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 250; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 251; AVX-NEXT: retq 252 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 253 %cvt = sitofp <4 x i16> %shuf to <4 x double> 254 ret <4 x double> %cvt 255} 256 257define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) { 258; SSE-LABEL: sitofp_8i16_to_4f64: 259; SSE: # BB#0: 260; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 261; SSE-NEXT: psrad $16, %xmm1 262; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 263; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 264; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 265; SSE-NEXT: retq 266; 267; AVX1-LABEL: sitofp_8i16_to_4f64: 268; AVX1: # BB#0: 269; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 270; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 271; AVX1-NEXT: retq 272; 273; AVX2-LABEL: sitofp_8i16_to_4f64: 274; AVX2: # BB#0: 275; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 276; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 277; AVX2-NEXT: retq 278 %cvt = sitofp <8 x i16> %a to <8 x double> 279 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 280 ret <4 x double> %shuf 281} 282 283define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) { 284; SSE-LABEL: sitofp_4i8_to_4f64: 285; SSE: # BB#0: 286; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 287; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 288; SSE-NEXT: psrad $24, %xmm1 289; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 290; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 291; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 292; SSE-NEXT: retq 293; 294; AVX-LABEL: sitofp_4i8_to_4f64: 295; AVX: # BB#0: 296; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 297; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 298; AVX-NEXT: retq 299 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 300 %cvt = sitofp <4 x i8> %shuf to <4 x double> 301 ret <4 x double> %cvt 302} 303 304define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { 305; SSE-LABEL: sitofp_16i8_to_4f64: 306; SSE: # BB#0: 307; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 308; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 309; SSE-NEXT: psrad $24, %xmm1 310; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 311; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 312; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 313; SSE-NEXT: retq 314; 315; AVX1-LABEL: sitofp_16i8_to_4f64: 316; AVX1: # BB#0: 317; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 318; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 319; AVX1-NEXT: retq 320; 321; AVX2-LABEL: sitofp_16i8_to_4f64: 322; AVX2: # BB#0: 323; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 324; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 325; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 326; AVX2-NEXT: retq 327 %cvt = sitofp <16 x i8> %a to <16 x double> 328 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 329 ret <4 x double> %shuf 330} 331 332; 333; Unsigned Integer to Double 334; 335 336define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) { 337; SSE-LABEL: uitofp_2i64_to_2f64: 338; SSE: # BB#0: 339; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 340; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 341; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 342; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 343; SSE-NEXT: subpd %xmm3, %xmm0 344; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] 345; SSE-NEXT: addpd %xmm4, %xmm0 346; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 347; SSE-NEXT: subpd %xmm3, %xmm2 348; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 349; SSE-NEXT: addpd %xmm2, %xmm1 350; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 351; SSE-NEXT: retq 352; 353; AVX-LABEL: uitofp_2i64_to_2f64: 354; AVX: # BB#0: 355; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 356; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 357; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 358; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 359; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 360; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 361; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 362; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 363; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 364; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] 365; AVX-NEXT: retq 366 %cvt = uitofp <2 x i64> %a to <2 x double> 367 ret <2 x double> %cvt 368} 369 370define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) { 371; SSE-LABEL: uitofp_2i32_to_2f64: 372; SSE: # BB#0: 373; SSE-NEXT: pxor %xmm1, %xmm1 374; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 375; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 376; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 377; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 378; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 379; SSE-NEXT: subpd %xmm3, %xmm0 380; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] 381; SSE-NEXT: addpd %xmm4, %xmm0 382; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 383; SSE-NEXT: subpd %xmm3, %xmm2 384; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 385; SSE-NEXT: addpd %xmm2, %xmm1 386; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 387; SSE-NEXT: retq 388; 389; AVX-LABEL: uitofp_2i32_to_2f64: 390; AVX: # BB#0: 391; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 392; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 393; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 394; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 395; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 396; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 397; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 398; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 399; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 400; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 401; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] 402; AVX-NEXT: retq 403 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 404 %cvt = uitofp <2 x i32> %shuf to <2 x double> 405 ret <2 x double> %cvt 406} 407 408define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) { 409; SSE-LABEL: uitofp_4i32_to_2f64: 410; SSE: # BB#0: 411; SSE-NEXT: pxor %xmm1, %xmm1 412; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 413; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 414; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 415; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 416; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 417; SSE-NEXT: subpd %xmm3, %xmm0 418; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] 419; SSE-NEXT: addpd %xmm4, %xmm0 420; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 421; SSE-NEXT: subpd %xmm3, %xmm2 422; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 423; SSE-NEXT: addpd %xmm2, %xmm1 424; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 425; SSE-NEXT: retq 426; 427; AVX1-LABEL: uitofp_4i32_to_2f64: 428; AVX1: # BB#0: 429; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 430; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 431; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 432; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 433; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 434; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 435; AVX1-NEXT: vzeroupper 436; AVX1-NEXT: retq 437; 438; AVX2-LABEL: uitofp_4i32_to_2f64: 439; AVX2: # BB#0: 440; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 441; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 442; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 443; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 444; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 445; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 446; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 447; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 448; AVX2-NEXT: vzeroupper 449; AVX2-NEXT: retq 450 %cvt = uitofp <4 x i32> %a to <4 x double> 451 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1> 452 ret <2 x double> %shuf 453} 454 455define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) { 456; SSE-LABEL: uitofp_2i16_to_2f64: 457; SSE: # BB#0: 458; SSE-NEXT: pxor %xmm1, %xmm1 459; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 460; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 461; SSE-NEXT: retq 462; 463; AVX-LABEL: uitofp_2i16_to_2f64: 464; AVX: # BB#0: 465; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 466; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 467; AVX-NEXT: retq 468 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 469 %cvt = uitofp <2 x i16> %shuf to <2 x double> 470 ret <2 x double> %cvt 471} 472 473define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) { 474; SSE-LABEL: uitofp_8i16_to_2f64: 475; SSE: # BB#0: 476; SSE-NEXT: pxor %xmm1, %xmm1 477; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 478; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 479; SSE-NEXT: retq 480; 481; AVX1-LABEL: uitofp_8i16_to_2f64: 482; AVX1: # BB#0: 483; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 484; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 485; AVX1-NEXT: vzeroupper 486; AVX1-NEXT: retq 487; 488; AVX2-LABEL: uitofp_8i16_to_2f64: 489; AVX2: # BB#0: 490; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 491; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 492; AVX2-NEXT: vzeroupper 493; AVX2-NEXT: retq 494 %cvt = uitofp <8 x i16> %a to <8 x double> 495 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1> 496 ret <2 x double> %shuf 497} 498 499define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) { 500; SSE-LABEL: uitofp_2i8_to_2f64: 501; SSE: # BB#0: 502; SSE-NEXT: pxor %xmm1, %xmm1 503; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 504; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 505; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 506; SSE-NEXT: retq 507; 508; AVX-LABEL: uitofp_2i8_to_2f64: 509; AVX: # BB#0: 510; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 511; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 512; AVX-NEXT: retq 513 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1> 514 %cvt = uitofp <2 x i8> %shuf to <2 x double> 515 ret <2 x double> %cvt 516} 517 518define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { 519; SSE-LABEL: uitofp_16i8_to_2f64: 520; SSE: # BB#0: 521; SSE-NEXT: pxor %xmm1, %xmm1 522; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 523; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 524; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 525; SSE-NEXT: retq 526; 527; AVX1-LABEL: uitofp_16i8_to_2f64: 528; AVX1: # BB#0: 529; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 530; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 531; AVX1-NEXT: vzeroupper 532; AVX1-NEXT: retq 533; 534; AVX2-LABEL: uitofp_16i8_to_2f64: 535; AVX2: # BB#0: 536; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 537; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 538; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 539; AVX2-NEXT: vzeroupper 540; AVX2-NEXT: retq 541 %cvt = uitofp <16 x i8> %a to <16 x double> 542 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1> 543 ret <2 x double> %shuf 544} 545 546define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { 547; SSE-LABEL: uitofp_4i64_to_4f64: 548; SSE: # BB#0: 549; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 550; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] 551; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 552; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 553; SSE-NEXT: subpd %xmm4, %xmm0 554; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] 555; SSE-NEXT: addpd %xmm5, %xmm0 556; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 557; SSE-NEXT: subpd %xmm4, %xmm3 558; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] 559; SSE-NEXT: addpd %xmm3, %xmm5 560; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] 561; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] 562; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 563; SSE-NEXT: subpd %xmm4, %xmm1 564; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] 565; SSE-NEXT: addpd %xmm5, %xmm1 566; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 567; SSE-NEXT: subpd %xmm4, %xmm3 568; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] 569; SSE-NEXT: addpd %xmm3, %xmm2 570; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 571; SSE-NEXT: retq 572; 573; AVX1-LABEL: uitofp_4i64_to_4f64: 574; AVX1: # BB#0: 575; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 576; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 577; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 578; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 579; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 580; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 581; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 582; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 583; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1 584; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 585; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] 586; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 587; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 588; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 589; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 590; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 591; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0 592; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 593; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] 594; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 595; AVX1-NEXT: retq 596; 597; AVX2-LABEL: uitofp_4i64_to_4f64: 598; AVX2: # BB#0: 599; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 600; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 601; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 602; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 603; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 604; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 605; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 606; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 607; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1 608; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 609; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] 610; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 611; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 612; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 613; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 614; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 615; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0 616; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 617; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] 618; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 619; AVX2-NEXT: retq 620 %cvt = uitofp <4 x i64> %a to <4 x double> 621 ret <4 x double> %cvt 622} 623 624define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) { 625; SSE-LABEL: uitofp_4i32_to_4f64: 626; SSE: # BB#0: 627; SSE-NEXT: movdqa %xmm0, %xmm2 628; SSE-NEXT: pxor %xmm1, %xmm1 629; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 630; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] 631; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] 632; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 633; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25] 634; SSE-NEXT: subpd %xmm5, %xmm0 635; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] 636; SSE-NEXT: addpd %xmm6, %xmm0 637; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 638; SSE-NEXT: subpd %xmm5, %xmm4 639; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1] 640; SSE-NEXT: addpd %xmm4, %xmm6 641; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0] 642; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 643; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] 644; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 645; SSE-NEXT: subpd %xmm5, %xmm2 646; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 647; SSE-NEXT: addpd %xmm2, %xmm1 648; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 649; SSE-NEXT: subpd %xmm5, %xmm4 650; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] 651; SSE-NEXT: addpd %xmm4, %xmm2 652; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 653; SSE-NEXT: retq 654; 655; AVX1-LABEL: uitofp_4i32_to_4f64: 656; AVX1: # BB#0: 657; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 658; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 659; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 660; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 661; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 662; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 663; AVX1-NEXT: retq 664; 665; AVX2-LABEL: uitofp_4i32_to_4f64: 666; AVX2: # BB#0: 667; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 668; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 669; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 670; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 671; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 672; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 673; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 674; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 675; AVX2-NEXT: retq 676 %cvt = uitofp <4 x i32> %a to <4 x double> 677 ret <4 x double> %cvt 678} 679 680define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) { 681; SSE-LABEL: uitofp_4i16_to_4f64: 682; SSE: # BB#0: 683; SSE-NEXT: pxor %xmm1, %xmm1 684; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 685; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 686; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 687; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 688; SSE-NEXT: movaps %xmm2, %xmm0 689; SSE-NEXT: retq 690; 691; AVX-LABEL: uitofp_4i16_to_4f64: 692; AVX: # BB#0: 693; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 694; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 695; AVX-NEXT: retq 696 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 697 %cvt = uitofp <4 x i16> %shuf to <4 x double> 698 ret <4 x double> %cvt 699} 700 701define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) { 702; SSE-LABEL: uitofp_8i16_to_4f64: 703; SSE: # BB#0: 704; SSE-NEXT: pxor %xmm1, %xmm1 705; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 706; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 707; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 708; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 709; SSE-NEXT: movaps %xmm2, %xmm0 710; SSE-NEXT: retq 711; 712; AVX1-LABEL: uitofp_8i16_to_4f64: 713; AVX1: # BB#0: 714; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 715; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 716; AVX1-NEXT: retq 717; 718; AVX2-LABEL: uitofp_8i16_to_4f64: 719; AVX2: # BB#0: 720; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 721; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 722; AVX2-NEXT: retq 723 %cvt = uitofp <8 x i16> %a to <8 x double> 724 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 725 ret <4 x double> %shuf 726} 727 728define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) { 729; SSE-LABEL: uitofp_4i8_to_4f64: 730; SSE: # BB#0: 731; SSE-NEXT: pxor %xmm1, %xmm1 732; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 733; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 734; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 735; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 736; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 737; SSE-NEXT: movaps %xmm2, %xmm0 738; SSE-NEXT: retq 739; 740; AVX-LABEL: uitofp_4i8_to_4f64: 741; AVX: # BB#0: 742; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 743; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 744; AVX-NEXT: retq 745 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 746 %cvt = uitofp <4 x i8> %shuf to <4 x double> 747 ret <4 x double> %cvt 748} 749 750define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { 751; SSE-LABEL: uitofp_16i8_to_4f64: 752; SSE: # BB#0: 753; SSE-NEXT: pxor %xmm1, %xmm1 754; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 755; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 756; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 757; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 758; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 759; SSE-NEXT: movaps %xmm2, %xmm0 760; SSE-NEXT: retq 761; 762; AVX1-LABEL: uitofp_16i8_to_4f64: 763; AVX1: # BB#0: 764; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 765; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 766; AVX1-NEXT: retq 767; 768; AVX2-LABEL: uitofp_16i8_to_4f64: 769; AVX2: # BB#0: 770; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 771; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 772; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 773; AVX2-NEXT: retq 774 %cvt = uitofp <16 x i8> %a to <16 x double> 775 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 776 ret <4 x double> %shuf 777} 778 779; 780; Signed Integer to Float 781; 782 783define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { 784; SSE-LABEL: sitofp_2i64_to_4f32: 785; SSE: # BB#0: 786; SSE-NEXT: movd %xmm0, %rax 787; SSE-NEXT: cvtsi2ssq %rax, %xmm1 788; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 789; SSE-NEXT: movd %xmm0, %rax 790; SSE-NEXT: xorps %xmm0, %xmm0 791; SSE-NEXT: cvtsi2ssq %rax, %xmm0 792; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 793; SSE-NEXT: movaps %xmm1, %xmm0 794; SSE-NEXT: retq 795; 796; AVX-LABEL: sitofp_2i64_to_4f32: 797; AVX: # BB#0: 798; AVX-NEXT: vpextrq $1, %xmm0, %rax 799; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 800; AVX-NEXT: vmovq %xmm0, %rax 801; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 802; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 803; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 804; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 805; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 806; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 807; AVX-NEXT: retq 808 %cvt = sitofp <2 x i64> %a to <2 x float> 809 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 810 ret <4 x float> %ext 811} 812 813define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { 814; SSE-LABEL: sitofp_4i64_to_4f32_undef: 815; SSE: # BB#0: 816; SSE-NEXT: cvtsi2ssq %rax, %xmm2 817; SSE-NEXT: movd %xmm0, %rax 818; SSE-NEXT: cvtsi2ssq %rax, %xmm1 819; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 820; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 821; SSE-NEXT: movd %xmm0, %rax 822; SSE-NEXT: xorps %xmm0, %xmm0 823; SSE-NEXT: cvtsi2ssq %rax, %xmm0 824; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 825; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 826; SSE-NEXT: movaps %xmm1, %xmm0 827; SSE-NEXT: retq 828; 829; AVX-LABEL: sitofp_4i64_to_4f32_undef: 830; AVX: # BB#0: 831; AVX-NEXT: vpextrq $1, %xmm0, %rax 832; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 833; AVX-NEXT: vmovq %xmm0, %rax 834; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 835; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 836; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 837; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 838; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 839; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 840; AVX-NEXT: retq 841 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 842 %cvt = sitofp <4 x i64> %ext to <4 x float> 843 ret <4 x float> %cvt 844} 845 846define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) { 847; SSE-LABEL: sitofp_4i32_to_4f32: 848; SSE: # BB#0: 849; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 850; SSE-NEXT: retq 851; 852; AVX-LABEL: sitofp_4i32_to_4f32: 853; AVX: # BB#0: 854; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 855; AVX-NEXT: retq 856 %cvt = sitofp <4 x i32> %a to <4 x float> 857 ret <4 x float> %cvt 858} 859 860define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) { 861; SSE-LABEL: sitofp_4i16_to_4f32: 862; SSE: # BB#0: 863; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 864; SSE-NEXT: psrad $16, %xmm0 865; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 866; SSE-NEXT: retq 867; 868; AVX-LABEL: sitofp_4i16_to_4f32: 869; AVX: # BB#0: 870; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 871; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 872; AVX-NEXT: retq 873 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 874 %cvt = sitofp <4 x i16> %shuf to <4 x float> 875 ret <4 x float> %cvt 876} 877 878define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { 879; SSE-LABEL: sitofp_8i16_to_4f32: 880; SSE: # BB#0: 881; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 882; SSE-NEXT: psrad $16, %xmm0 883; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 884; SSE-NEXT: retq 885; 886; AVX1-LABEL: sitofp_8i16_to_4f32: 887; AVX1: # BB#0: 888; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 889; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 890; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 891; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 892; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 893; AVX1-NEXT: vzeroupper 894; AVX1-NEXT: retq 895; 896; AVX2-LABEL: sitofp_8i16_to_4f32: 897; AVX2: # BB#0: 898; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 899; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 900; AVX2-NEXT: vzeroupper 901; AVX2-NEXT: retq 902 %cvt = sitofp <8 x i16> %a to <8 x float> 903 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 904 ret <4 x float> %shuf 905} 906 907define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) { 908; SSE-LABEL: sitofp_4i8_to_4f32: 909; SSE: # BB#0: 910; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 911; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 912; SSE-NEXT: psrad $24, %xmm0 913; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 914; SSE-NEXT: retq 915; 916; AVX-LABEL: sitofp_4i8_to_4f32: 917; AVX: # BB#0: 918; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 919; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 920; AVX-NEXT: retq 921 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 922 %cvt = sitofp <4 x i8> %shuf to <4 x float> 923 ret <4 x float> %cvt 924} 925 926define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { 927; SSE-LABEL: sitofp_16i8_to_4f32: 928; SSE: # BB#0: 929; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 930; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 931; SSE-NEXT: psrad $24, %xmm0 932; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 933; SSE-NEXT: retq 934; 935; AVX1-LABEL: sitofp_16i8_to_4f32: 936; AVX1: # BB#0: 937; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 938; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 939; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 940; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 941; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 942; AVX1-NEXT: vzeroupper 943; AVX1-NEXT: retq 944; 945; AVX2-LABEL: sitofp_16i8_to_4f32: 946; AVX2: # BB#0: 947; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 948; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 949; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 950; AVX2-NEXT: vzeroupper 951; AVX2-NEXT: retq 952 %cvt = sitofp <16 x i8> %a to <16 x float> 953 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 954 ret <4 x float> %shuf 955} 956 957define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { 958; SSE-LABEL: sitofp_4i64_to_4f32: 959; SSE: # BB#0: 960; SSE-NEXT: movd %xmm1, %rax 961; SSE-NEXT: cvtsi2ssq %rax, %xmm3 962; SSE-NEXT: movd %xmm0, %rax 963; SSE-NEXT: cvtsi2ssq %rax, %xmm2 964; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 965; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 966; SSE-NEXT: movd %xmm1, %rax 967; SSE-NEXT: xorps %xmm1, %xmm1 968; SSE-NEXT: cvtsi2ssq %rax, %xmm1 969; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 970; SSE-NEXT: movd %xmm0, %rax 971; SSE-NEXT: xorps %xmm0, %xmm0 972; SSE-NEXT: cvtsi2ssq %rax, %xmm0 973; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 974; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 975; SSE-NEXT: movaps %xmm2, %xmm0 976; SSE-NEXT: retq 977; 978; AVX1-LABEL: sitofp_4i64_to_4f32: 979; AVX1: # BB#0: 980; AVX1-NEXT: vpextrq $1, %xmm0, %rax 981; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 982; AVX1-NEXT: vmovq %xmm0, %rax 983; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 984; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 985; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 986; AVX1-NEXT: vmovq %xmm0, %rax 987; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 988; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 989; AVX1-NEXT: vpextrq $1, %xmm0, %rax 990; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 991; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 992; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 993; AVX1-NEXT: vzeroupper 994; AVX1-NEXT: retq 995; 996; AVX2-LABEL: sitofp_4i64_to_4f32: 997; AVX2: # BB#0: 998; AVX2-NEXT: vpextrq $1, %xmm0, %rax 999; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1000; AVX2-NEXT: vmovq %xmm0, %rax 1001; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1002; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 1003; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1004; AVX2-NEXT: vmovq %xmm0, %rax 1005; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1006; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 1007; AVX2-NEXT: vpextrq $1, %xmm0, %rax 1008; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 1009; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 1010; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1011; AVX2-NEXT: vzeroupper 1012; AVX2-NEXT: retq 1013 %cvt = sitofp <4 x i64> %a to <4 x float> 1014 ret <4 x float> %cvt 1015} 1016 1017define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) { 1018; SSE-LABEL: sitofp_8i32_to_8f32: 1019; SSE: # BB#0: 1020; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 1021; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 1022; SSE-NEXT: retq 1023; 1024; AVX-LABEL: sitofp_8i32_to_8f32: 1025; AVX: # BB#0: 1026; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 1027; AVX-NEXT: retq 1028 %cvt = sitofp <8 x i32> %a to <8 x float> 1029 ret <8 x float> %cvt 1030} 1031 1032define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) { 1033; SSE-LABEL: sitofp_8i16_to_8f32: 1034; SSE: # BB#0: 1035; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1036; SSE-NEXT: psrad $16, %xmm1 1037; SSE-NEXT: cvtdq2ps %xmm1, %xmm2 1038; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1039; SSE-NEXT: psrad $16, %xmm0 1040; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1041; SSE-NEXT: movaps %xmm2, %xmm0 1042; SSE-NEXT: retq 1043; 1044; AVX1-LABEL: sitofp_8i16_to_8f32: 1045; AVX1: # BB#0: 1046; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 1047; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1048; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 1049; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1050; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1051; AVX1-NEXT: retq 1052; 1053; AVX2-LABEL: sitofp_8i16_to_8f32: 1054; AVX2: # BB#0: 1055; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 1056; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1057; AVX2-NEXT: retq 1058 %cvt = sitofp <8 x i16> %a to <8 x float> 1059 ret <8 x float> %cvt 1060} 1061 1062define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) { 1063; SSE-LABEL: sitofp_8i8_to_8f32: 1064; SSE: # BB#0: 1065; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1066; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1067; SSE-NEXT: psrad $24, %xmm1 1068; SSE-NEXT: cvtdq2ps %xmm1, %xmm2 1069; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1070; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1071; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1072; SSE-NEXT: psrad $24, %xmm0 1073; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1074; SSE-NEXT: movaps %xmm2, %xmm0 1075; SSE-NEXT: retq 1076; 1077; AVX1-LABEL: sitofp_8i8_to_8f32: 1078; AVX1: # BB#0: 1079; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 1080; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1081; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 1082; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1083; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1084; AVX1-NEXT: retq 1085; 1086; AVX2-LABEL: sitofp_8i8_to_8f32: 1087; AVX2: # BB#0: 1088; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1089; AVX2-NEXT: vpslld $24, %ymm0, %ymm0 1090; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 1091; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1092; AVX2-NEXT: retq 1093 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1094 %cvt = sitofp <8 x i8> %shuf to <8 x float> 1095 ret <8 x float> %cvt 1096} 1097 1098define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { 1099; SSE-LABEL: sitofp_16i8_to_8f32: 1100; SSE: # BB#0: 1101; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1102; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 1103; SSE-NEXT: psrad $24, %xmm1 1104; SSE-NEXT: cvtdq2ps %xmm1, %xmm2 1105; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1106; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1107; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1108; SSE-NEXT: psrad $24, %xmm0 1109; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1110; SSE-NEXT: movaps %xmm2, %xmm0 1111; SSE-NEXT: retq 1112; 1113; AVX1-LABEL: sitofp_16i8_to_8f32: 1114; AVX1: # BB#0: 1115; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 1116; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1117; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 1118; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1119; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1120; AVX1-NEXT: retq 1121; 1122; AVX2-LABEL: sitofp_16i8_to_8f32: 1123; AVX2: # BB#0: 1124; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 1125; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 1126; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1127; AVX2-NEXT: retq 1128 %cvt = sitofp <16 x i8> %a to <16 x float> 1129 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1130 ret <8 x float> %shuf 1131} 1132 1133; 1134; Unsigned Integer to Float 1135; 1136 1137define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { 1138; SSE-LABEL: uitofp_2i64_to_4f32: 1139; SSE: # BB#0: 1140; SSE-NEXT: movdqa %xmm0, %xmm1 1141; SSE-NEXT: movd %xmm1, %rax 1142; SSE-NEXT: movl %eax, %ecx 1143; SSE-NEXT: andl $1, %ecx 1144; SSE-NEXT: testq %rax, %rax 1145; SSE-NEXT: js .LBB38_1 1146; SSE-NEXT: # BB#2: 1147; SSE-NEXT: xorps %xmm0, %xmm0 1148; SSE-NEXT: cvtsi2ssq %rax, %xmm0 1149; SSE-NEXT: jmp .LBB38_3 1150; SSE-NEXT: .LBB38_1: 1151; SSE-NEXT: shrq %rax 1152; SSE-NEXT: orq %rax, %rcx 1153; SSE-NEXT: xorps %xmm0, %xmm0 1154; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 1155; SSE-NEXT: addss %xmm0, %xmm0 1156; SSE-NEXT: .LBB38_3: 1157; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1158; SSE-NEXT: movd %xmm1, %rax 1159; SSE-NEXT: movl %eax, %ecx 1160; SSE-NEXT: andl $1, %ecx 1161; SSE-NEXT: testq %rax, %rax 1162; SSE-NEXT: js .LBB38_4 1163; SSE-NEXT: # BB#5: 1164; SSE-NEXT: xorps %xmm1, %xmm1 1165; SSE-NEXT: cvtsi2ssq %rax, %xmm1 1166; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1167; SSE-NEXT: retq 1168; SSE-NEXT: .LBB38_4: 1169; SSE-NEXT: shrq %rax 1170; SSE-NEXT: orq %rax, %rcx 1171; SSE-NEXT: xorps %xmm1, %xmm1 1172; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 1173; SSE-NEXT: addss %xmm1, %xmm1 1174; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1175; SSE-NEXT: retq 1176; 1177; AVX-LABEL: uitofp_2i64_to_4f32: 1178; AVX: # BB#0: 1179; AVX-NEXT: vpextrq $1, %xmm0, %rax 1180; AVX-NEXT: movl %eax, %ecx 1181; AVX-NEXT: andl $1, %ecx 1182; AVX-NEXT: testq %rax, %rax 1183; AVX-NEXT: js .LBB38_1 1184; AVX-NEXT: # BB#2: 1185; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1186; AVX-NEXT: jmp .LBB38_3 1187; AVX-NEXT: .LBB38_1: 1188; AVX-NEXT: shrq %rax 1189; AVX-NEXT: orq %rax, %rcx 1190; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 1191; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1 1192; AVX-NEXT: .LBB38_3: 1193; AVX-NEXT: vmovq %xmm0, %rax 1194; AVX-NEXT: movl %eax, %ecx 1195; AVX-NEXT: andl $1, %ecx 1196; AVX-NEXT: testq %rax, %rax 1197; AVX-NEXT: js .LBB38_4 1198; AVX-NEXT: # BB#5: 1199; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1200; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 1201; AVX-NEXT: jmp .LBB38_6 1202; AVX-NEXT: .LBB38_4: 1203; AVX-NEXT: shrq %rax 1204; AVX-NEXT: orq %rax, %rcx 1205; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1206; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 1207; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 1208; AVX-NEXT: .LBB38_6: 1209; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 1210; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1211; AVX-NEXT: testq %rax, %rax 1212; AVX-NEXT: js .LBB38_8 1213; AVX-NEXT: # BB#7: 1214; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1215; AVX-NEXT: .LBB38_8: 1216; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 1217; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 1218; AVX-NEXT: retq 1219 %cvt = uitofp <2 x i64> %a to <2 x float> 1220 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1221 ret <4 x float> %ext 1222} 1223 1224define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { 1225; SSE-LABEL: uitofp_4i64_to_4f32_undef: 1226; SSE: # BB#0: 1227; SSE-NEXT: movdqa %xmm0, %xmm1 1228; SSE-NEXT: testq %rax, %rax 1229; SSE-NEXT: xorps %xmm2, %xmm2 1230; SSE-NEXT: js .LBB39_2 1231; SSE-NEXT: # BB#1: 1232; SSE-NEXT: xorps %xmm2, %xmm2 1233; SSE-NEXT: cvtsi2ssq %rax, %xmm2 1234; SSE-NEXT: .LBB39_2: 1235; SSE-NEXT: movd %xmm1, %rax 1236; SSE-NEXT: movl %eax, %ecx 1237; SSE-NEXT: andl $1, %ecx 1238; SSE-NEXT: testq %rax, %rax 1239; SSE-NEXT: js .LBB39_3 1240; SSE-NEXT: # BB#4: 1241; SSE-NEXT: xorps %xmm0, %xmm0 1242; SSE-NEXT: cvtsi2ssq %rax, %xmm0 1243; SSE-NEXT: jmp .LBB39_5 1244; SSE-NEXT: .LBB39_3: 1245; SSE-NEXT: shrq %rax 1246; SSE-NEXT: orq %rax, %rcx 1247; SSE-NEXT: xorps %xmm0, %xmm0 1248; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 1249; SSE-NEXT: addss %xmm0, %xmm0 1250; SSE-NEXT: .LBB39_5: 1251; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1252; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1253; SSE-NEXT: movd %xmm1, %rax 1254; SSE-NEXT: movl %eax, %ecx 1255; SSE-NEXT: andl $1, %ecx 1256; SSE-NEXT: testq %rax, %rax 1257; SSE-NEXT: js .LBB39_6 1258; SSE-NEXT: # BB#7: 1259; SSE-NEXT: xorps %xmm1, %xmm1 1260; SSE-NEXT: cvtsi2ssq %rax, %xmm1 1261; SSE-NEXT: jmp .LBB39_8 1262; SSE-NEXT: .LBB39_6: 1263; SSE-NEXT: shrq %rax 1264; SSE-NEXT: orq %rax, %rcx 1265; SSE-NEXT: xorps %xmm1, %xmm1 1266; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 1267; SSE-NEXT: addss %xmm1, %xmm1 1268; SSE-NEXT: .LBB39_8: 1269; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1270; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1271; SSE-NEXT: retq 1272; 1273; AVX-LABEL: uitofp_4i64_to_4f32_undef: 1274; AVX: # BB#0: 1275; AVX-NEXT: vpextrq $1, %xmm0, %rax 1276; AVX-NEXT: movl %eax, %ecx 1277; AVX-NEXT: andl $1, %ecx 1278; AVX-NEXT: testq %rax, %rax 1279; AVX-NEXT: js .LBB39_1 1280; AVX-NEXT: # BB#2: 1281; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1282; AVX-NEXT: jmp .LBB39_3 1283; AVX-NEXT: .LBB39_1: 1284; AVX-NEXT: shrq %rax 1285; AVX-NEXT: orq %rax, %rcx 1286; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 1287; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1 1288; AVX-NEXT: .LBB39_3: 1289; AVX-NEXT: vmovq %xmm0, %rax 1290; AVX-NEXT: movl %eax, %ecx 1291; AVX-NEXT: andl $1, %ecx 1292; AVX-NEXT: testq %rax, %rax 1293; AVX-NEXT: js .LBB39_4 1294; AVX-NEXT: # BB#5: 1295; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1296; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 1297; AVX-NEXT: jmp .LBB39_6 1298; AVX-NEXT: .LBB39_4: 1299; AVX-NEXT: shrq %rax 1300; AVX-NEXT: orq %rax, %rcx 1301; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1302; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 1303; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 1304; AVX-NEXT: .LBB39_6: 1305; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 1306; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1307; AVX-NEXT: testq %rax, %rax 1308; AVX-NEXT: js .LBB39_8 1309; AVX-NEXT: # BB#7: 1310; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1311; AVX-NEXT: .LBB39_8: 1312; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 1313; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 1314; AVX-NEXT: retq 1315 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1316 %cvt = uitofp <4 x i64> %ext to <4 x float> 1317 ret <4 x float> %cvt 1318} 1319 1320define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) { 1321; SSE-LABEL: uitofp_4i32_to_4f32: 1322; SSE: # BB#0: 1323; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] 1324; SSE-NEXT: pand %xmm0, %xmm1 1325; SSE-NEXT: por {{.*}}(%rip), %xmm1 1326; SSE-NEXT: psrld $16, %xmm0 1327; SSE-NEXT: por {{.*}}(%rip), %xmm0 1328; SSE-NEXT: addps {{.*}}(%rip), %xmm0 1329; SSE-NEXT: addps %xmm1, %xmm0 1330; SSE-NEXT: retq 1331; 1332; AVX1-LABEL: uitofp_4i32_to_4f32: 1333; AVX1: # BB#0: 1334; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] 1335; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 1336; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] 1337; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 1338; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 1339; AVX1-NEXT: retq 1340; 1341; AVX2-LABEL: uitofp_4i32_to_4f32: 1342; AVX2: # BB#0: 1343; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 1344; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 1345; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 1346; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 1347; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 1348; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 1349; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 1350; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 1351; AVX2-NEXT: retq 1352 %cvt = uitofp <4 x i32> %a to <4 x float> 1353 ret <4 x float> %cvt 1354} 1355 1356define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) { 1357; SSE-LABEL: uitofp_4i16_to_4f32: 1358; SSE: # BB#0: 1359; SSE-NEXT: pxor %xmm1, %xmm1 1360; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1361; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 1362; SSE-NEXT: retq 1363; 1364; AVX-LABEL: uitofp_4i16_to_4f32: 1365; AVX: # BB#0: 1366; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1367; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 1368; AVX-NEXT: retq 1369 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1370 %cvt = uitofp <4 x i16> %shuf to <4 x float> 1371 ret <4 x float> %cvt 1372} 1373 1374define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) { 1375; SSE-LABEL: uitofp_8i16_to_4f32: 1376; SSE: # BB#0: 1377; SSE-NEXT: pxor %xmm1, %xmm1 1378; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1379; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 1380; SSE-NEXT: retq 1381; 1382; AVX1-LABEL: uitofp_8i16_to_4f32: 1383; AVX1: # BB#0: 1384; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1385; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1386; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1387; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1388; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1389; AVX1-NEXT: vzeroupper 1390; AVX1-NEXT: retq 1391; 1392; AVX2-LABEL: uitofp_8i16_to_4f32: 1393; AVX2: # BB#0: 1394; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1395; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1396; AVX2-NEXT: vzeroupper 1397; AVX2-NEXT: retq 1398 %cvt = uitofp <8 x i16> %a to <8 x float> 1399 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1400 ret <4 x float> %shuf 1401} 1402 1403define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) { 1404; SSE-LABEL: uitofp_4i8_to_4f32: 1405; SSE: # BB#0: 1406; SSE-NEXT: pxor %xmm1, %xmm1 1407; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1408; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1409; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 1410; SSE-NEXT: retq 1411; 1412; AVX-LABEL: uitofp_4i8_to_4f32: 1413; AVX: # BB#0: 1414; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1415; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 1416; AVX-NEXT: retq 1417 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1418 %cvt = uitofp <4 x i8> %shuf to <4 x float> 1419 ret <4 x float> %cvt 1420} 1421 1422define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { 1423; SSE-LABEL: uitofp_16i8_to_4f32: 1424; SSE: # BB#0: 1425; SSE-NEXT: pxor %xmm1, %xmm1 1426; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1427; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1428; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 1429; SSE-NEXT: retq 1430; 1431; AVX1-LABEL: uitofp_16i8_to_4f32: 1432; AVX1: # BB#0: 1433; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1434; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1435; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1436; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1437; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1438; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1439; AVX1-NEXT: vzeroupper 1440; AVX1-NEXT: retq 1441; 1442; AVX2-LABEL: uitofp_16i8_to_4f32: 1443; AVX2: # BB#0: 1444; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1445; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1446; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1447; AVX2-NEXT: vzeroupper 1448; AVX2-NEXT: retq 1449 %cvt = uitofp <16 x i8> %a to <16 x float> 1450 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1451 ret <4 x float> %shuf 1452} 1453 1454define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { 1455; SSE-LABEL: uitofp_4i64_to_4f32: 1456; SSE: # BB#0: 1457; SSE-NEXT: movd %xmm1, %rax 1458; SSE-NEXT: movl %eax, %ecx 1459; SSE-NEXT: andl $1, %ecx 1460; SSE-NEXT: testq %rax, %rax 1461; SSE-NEXT: js .LBB45_1 1462; SSE-NEXT: # BB#2: 1463; SSE-NEXT: cvtsi2ssq %rax, %xmm3 1464; SSE-NEXT: jmp .LBB45_3 1465; SSE-NEXT: .LBB45_1: 1466; SSE-NEXT: shrq %rax 1467; SSE-NEXT: orq %rax, %rcx 1468; SSE-NEXT: cvtsi2ssq %rcx, %xmm3 1469; SSE-NEXT: addss %xmm3, %xmm3 1470; SSE-NEXT: .LBB45_3: 1471; SSE-NEXT: movd %xmm0, %rax 1472; SSE-NEXT: movl %eax, %ecx 1473; SSE-NEXT: andl $1, %ecx 1474; SSE-NEXT: testq %rax, %rax 1475; SSE-NEXT: js .LBB45_4 1476; SSE-NEXT: # BB#5: 1477; SSE-NEXT: cvtsi2ssq %rax, %xmm2 1478; SSE-NEXT: jmp .LBB45_6 1479; SSE-NEXT: .LBB45_4: 1480; SSE-NEXT: shrq %rax 1481; SSE-NEXT: orq %rax, %rcx 1482; SSE-NEXT: cvtsi2ssq %rcx, %xmm2 1483; SSE-NEXT: addss %xmm2, %xmm2 1484; SSE-NEXT: .LBB45_6: 1485; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1486; SSE-NEXT: movd %xmm1, %rax 1487; SSE-NEXT: movl %eax, %ecx 1488; SSE-NEXT: andl $1, %ecx 1489; SSE-NEXT: testq %rax, %rax 1490; SSE-NEXT: js .LBB45_7 1491; SSE-NEXT: # BB#8: 1492; SSE-NEXT: xorps %xmm1, %xmm1 1493; SSE-NEXT: cvtsi2ssq %rax, %xmm1 1494; SSE-NEXT: jmp .LBB45_9 1495; SSE-NEXT: .LBB45_7: 1496; SSE-NEXT: shrq %rax 1497; SSE-NEXT: orq %rax, %rcx 1498; SSE-NEXT: xorps %xmm1, %xmm1 1499; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 1500; SSE-NEXT: addss %xmm1, %xmm1 1501; SSE-NEXT: .LBB45_9: 1502; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1503; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1504; SSE-NEXT: movd %xmm0, %rax 1505; SSE-NEXT: movl %eax, %ecx 1506; SSE-NEXT: andl $1, %ecx 1507; SSE-NEXT: testq %rax, %rax 1508; SSE-NEXT: js .LBB45_10 1509; SSE-NEXT: # BB#11: 1510; SSE-NEXT: xorps %xmm0, %xmm0 1511; SSE-NEXT: cvtsi2ssq %rax, %xmm0 1512; SSE-NEXT: jmp .LBB45_12 1513; SSE-NEXT: .LBB45_10: 1514; SSE-NEXT: shrq %rax 1515; SSE-NEXT: orq %rax, %rcx 1516; SSE-NEXT: xorps %xmm0, %xmm0 1517; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 1518; SSE-NEXT: addss %xmm0, %xmm0 1519; SSE-NEXT: .LBB45_12: 1520; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1521; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1522; SSE-NEXT: movaps %xmm2, %xmm0 1523; SSE-NEXT: retq 1524; 1525; AVX1-LABEL: uitofp_4i64_to_4f32: 1526; AVX1: # BB#0: 1527; AVX1-NEXT: vpextrq $1, %xmm0, %rax 1528; AVX1-NEXT: movl %eax, %ecx 1529; AVX1-NEXT: andl $1, %ecx 1530; AVX1-NEXT: testq %rax, %rax 1531; AVX1-NEXT: js .LBB45_1 1532; AVX1-NEXT: # BB#2: 1533; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1534; AVX1-NEXT: jmp .LBB45_3 1535; AVX1-NEXT: .LBB45_1: 1536; AVX1-NEXT: shrq %rax 1537; AVX1-NEXT: orq %rax, %rcx 1538; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 1539; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 1540; AVX1-NEXT: .LBB45_3: 1541; AVX1-NEXT: vmovq %xmm0, %rax 1542; AVX1-NEXT: movl %eax, %ecx 1543; AVX1-NEXT: andl $1, %ecx 1544; AVX1-NEXT: testq %rax, %rax 1545; AVX1-NEXT: js .LBB45_4 1546; AVX1-NEXT: # BB#5: 1547; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1548; AVX1-NEXT: jmp .LBB45_6 1549; AVX1-NEXT: .LBB45_4: 1550; AVX1-NEXT: shrq %rax 1551; AVX1-NEXT: orq %rax, %rcx 1552; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1553; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 1554; AVX1-NEXT: .LBB45_6: 1555; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 1556; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1557; AVX1-NEXT: vmovq %xmm0, %rax 1558; AVX1-NEXT: movl %eax, %ecx 1559; AVX1-NEXT: andl $1, %ecx 1560; AVX1-NEXT: testq %rax, %rax 1561; AVX1-NEXT: js .LBB45_7 1562; AVX1-NEXT: # BB#8: 1563; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1564; AVX1-NEXT: jmp .LBB45_9 1565; AVX1-NEXT: .LBB45_7: 1566; AVX1-NEXT: shrq %rax 1567; AVX1-NEXT: orq %rax, %rcx 1568; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1569; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 1570; AVX1-NEXT: .LBB45_9: 1571; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 1572; AVX1-NEXT: vpextrq $1, %xmm0, %rax 1573; AVX1-NEXT: movl %eax, %ecx 1574; AVX1-NEXT: andl $1, %ecx 1575; AVX1-NEXT: testq %rax, %rax 1576; AVX1-NEXT: js .LBB45_10 1577; AVX1-NEXT: # BB#11: 1578; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 1579; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 1580; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1581; AVX1-NEXT: vzeroupper 1582; AVX1-NEXT: retq 1583; AVX1-NEXT: .LBB45_10: 1584; AVX1-NEXT: shrq %rax 1585; AVX1-NEXT: orq %rax, %rcx 1586; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 1587; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 1588; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1589; AVX1-NEXT: vzeroupper 1590; AVX1-NEXT: retq 1591; 1592; AVX2-LABEL: uitofp_4i64_to_4f32: 1593; AVX2: # BB#0: 1594; AVX2-NEXT: vpextrq $1, %xmm0, %rax 1595; AVX2-NEXT: movl %eax, %ecx 1596; AVX2-NEXT: andl $1, %ecx 1597; AVX2-NEXT: testq %rax, %rax 1598; AVX2-NEXT: js .LBB45_1 1599; AVX2-NEXT: # BB#2: 1600; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1601; AVX2-NEXT: jmp .LBB45_3 1602; AVX2-NEXT: .LBB45_1: 1603; AVX2-NEXT: shrq %rax 1604; AVX2-NEXT: orq %rax, %rcx 1605; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 1606; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 1607; AVX2-NEXT: .LBB45_3: 1608; AVX2-NEXT: vmovq %xmm0, %rax 1609; AVX2-NEXT: movl %eax, %ecx 1610; AVX2-NEXT: andl $1, %ecx 1611; AVX2-NEXT: testq %rax, %rax 1612; AVX2-NEXT: js .LBB45_4 1613; AVX2-NEXT: # BB#5: 1614; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1615; AVX2-NEXT: jmp .LBB45_6 1616; AVX2-NEXT: .LBB45_4: 1617; AVX2-NEXT: shrq %rax 1618; AVX2-NEXT: orq %rax, %rcx 1619; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1620; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 1621; AVX2-NEXT: .LBB45_6: 1622; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 1623; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1624; AVX2-NEXT: vmovq %xmm0, %rax 1625; AVX2-NEXT: movl %eax, %ecx 1626; AVX2-NEXT: andl $1, %ecx 1627; AVX2-NEXT: testq %rax, %rax 1628; AVX2-NEXT: js .LBB45_7 1629; AVX2-NEXT: # BB#8: 1630; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1631; AVX2-NEXT: jmp .LBB45_9 1632; AVX2-NEXT: .LBB45_7: 1633; AVX2-NEXT: shrq %rax 1634; AVX2-NEXT: orq %rax, %rcx 1635; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1636; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 1637; AVX2-NEXT: .LBB45_9: 1638; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 1639; AVX2-NEXT: vpextrq $1, %xmm0, %rax 1640; AVX2-NEXT: movl %eax, %ecx 1641; AVX2-NEXT: andl $1, %ecx 1642; AVX2-NEXT: testq %rax, %rax 1643; AVX2-NEXT: js .LBB45_10 1644; AVX2-NEXT: # BB#11: 1645; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 1646; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 1647; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1648; AVX2-NEXT: vzeroupper 1649; AVX2-NEXT: retq 1650; AVX2-NEXT: .LBB45_10: 1651; AVX2-NEXT: shrq %rax 1652; AVX2-NEXT: orq %rax, %rcx 1653; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 1654; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 1655; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1656; AVX2-NEXT: vzeroupper 1657; AVX2-NEXT: retq 1658 %cvt = uitofp <4 x i64> %a to <4 x float> 1659 ret <4 x float> %cvt 1660} 1661 1662define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { 1663; SSE-LABEL: uitofp_8i32_to_8f32: 1664; SSE: # BB#0: 1665; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] 1666; SSE-NEXT: movdqa %xmm0, %xmm3 1667; SSE-NEXT: pand %xmm2, %xmm3 1668; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] 1669; SSE-NEXT: por %xmm4, %xmm3 1670; SSE-NEXT: psrld $16, %xmm0 1671; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] 1672; SSE-NEXT: por %xmm5, %xmm0 1673; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11] 1674; SSE-NEXT: addps %xmm6, %xmm0 1675; SSE-NEXT: addps %xmm3, %xmm0 1676; SSE-NEXT: pand %xmm1, %xmm2 1677; SSE-NEXT: por %xmm4, %xmm2 1678; SSE-NEXT: psrld $16, %xmm1 1679; SSE-NEXT: por %xmm5, %xmm1 1680; SSE-NEXT: addps %xmm6, %xmm1 1681; SSE-NEXT: addps %xmm2, %xmm1 1682; SSE-NEXT: retq 1683; 1684; AVX1-LABEL: uitofp_8i32_to_8f32: 1685; AVX1: # BB#0: 1686; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 1687; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 1688; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 1689; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1690; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 1691; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1692; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1693; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1694; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 1695; AVX1-NEXT: retq 1696; 1697; AVX2-LABEL: uitofp_8i32_to_8f32: 1698; AVX2: # BB#0: 1699; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 1700; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 1701; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 1702; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 1703; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] 1704; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 1705; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 1706; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 1707; AVX2-NEXT: retq 1708 %cvt = uitofp <8 x i32> %a to <8 x float> 1709 ret <8 x float> %cvt 1710} 1711 1712define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { 1713; SSE-LABEL: uitofp_8i16_to_8f32: 1714; SSE: # BB#0: 1715; SSE-NEXT: pxor %xmm1, %xmm1 1716; SSE-NEXT: movdqa %xmm0, %xmm2 1717; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1718; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1719; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1720; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1721; SSE-NEXT: movaps %xmm2, %xmm0 1722; SSE-NEXT: retq 1723; 1724; AVX1-LABEL: uitofp_8i16_to_8f32: 1725; AVX1: # BB#0: 1726; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1727; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1728; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1729; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1730; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1731; AVX1-NEXT: retq 1732; 1733; AVX2-LABEL: uitofp_8i16_to_8f32: 1734; AVX2: # BB#0: 1735; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1736; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1737; AVX2-NEXT: retq 1738 %cvt = uitofp <8 x i16> %a to <8 x float> 1739 ret <8 x float> %cvt 1740} 1741 1742define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { 1743; SSE-LABEL: uitofp_8i8_to_8f32: 1744; SSE: # BB#0: 1745; SSE-NEXT: pxor %xmm1, %xmm1 1746; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1747; SSE-NEXT: movdqa %xmm0, %xmm2 1748; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1749; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1750; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1751; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1752; SSE-NEXT: movaps %xmm2, %xmm0 1753; SSE-NEXT: retq 1754; 1755; AVX1-LABEL: uitofp_8i8_to_8f32: 1756; AVX1: # BB#0: 1757; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1758; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1759; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1760; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1761; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1762; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1763; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1764; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1765; AVX1-NEXT: retq 1766; 1767; AVX2-LABEL: uitofp_8i8_to_8f32: 1768; AVX2: # BB#0: 1769; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1770; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 1771; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1772; AVX2-NEXT: retq 1773 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1774 %cvt = uitofp <8 x i8> %shuf to <8 x float> 1775 ret <8 x float> %cvt 1776} 1777 1778define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { 1779; SSE-LABEL: uitofp_16i8_to_8f32: 1780; SSE: # BB#0: 1781; SSE-NEXT: pxor %xmm1, %xmm1 1782; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1783; SSE-NEXT: movdqa %xmm0, %xmm2 1784; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1785; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1786; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1787; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1788; SSE-NEXT: movaps %xmm2, %xmm0 1789; SSE-NEXT: retq 1790; 1791; AVX1-LABEL: uitofp_16i8_to_8f32: 1792; AVX1: # BB#0: 1793; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1794; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1795; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1796; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1797; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1798; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1799; AVX1-NEXT: retq 1800; 1801; AVX2-LABEL: uitofp_16i8_to_8f32: 1802; AVX2: # BB#0: 1803; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1804; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1805; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1806; AVX2-NEXT: retq 1807 %cvt = uitofp <16 x i8> %a to <16 x float> 1808 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1809 ret <8 x float> %shuf 1810} 1811 1812; 1813; Aggregates 1814; 1815 1816%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }> 1817define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) { 1818; SSE-LABEL: aggregate_sitofp_8i16_to_8f32: 1819; SSE: # BB#0: 1820; SSE-NEXT: movq 24(%rdi), %rax 1821; SSE-NEXT: movdqu 8(%rdi), %xmm0 1822; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1823; SSE-NEXT: psrad $16, %xmm1 1824; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 1825; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1826; SSE-NEXT: psrad $16, %xmm0 1827; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 1828; SSE-NEXT: movaps %xmm0, 16(%rax) 1829; SSE-NEXT: movaps %xmm1, (%rax) 1830; SSE-NEXT: retq 1831; 1832; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: 1833; AVX1: # BB#0: 1834; AVX1-NEXT: movq 24(%rdi), %rax 1835; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0 1836; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 1837; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1838; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 1839; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1840; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1841; AVX1-NEXT: vmovaps %ymm0, (%rax) 1842; AVX1-NEXT: vzeroupper 1843; AVX1-NEXT: retq 1844; 1845; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32: 1846; AVX2: # BB#0: 1847; AVX2-NEXT: movq 24(%rdi), %rax 1848; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0 1849; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1850; AVX2-NEXT: vmovaps %ymm0, (%rax) 1851; AVX2-NEXT: vzeroupper 1852; AVX2-NEXT: retq 1853 %1 = load %Arguments, %Arguments* %a0, align 1 1854 %2 = extractvalue %Arguments %1, 1 1855 %3 = extractvalue %Arguments %1, 2 1856 %4 = sitofp <8 x i16> %2 to <8 x float> 1857 store <8 x float> %4, <8 x float>* %3, align 32 1858 ret void 1859} 1860