; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
;
; 32-bit tests to make sure we're not doing anything stupid.
; RUN: llc < %s -mtriple=i686-unknown-unknown
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2

;
; Signed Integer to Double
;

define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    # kill
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = sitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2sdq %rax, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2sdq %rax, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    movapd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm1, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Unsigned Integer to Double
;

define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i32_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; AVX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    retq
  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i32> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    subpd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i16> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %cvt = uitofp <2 x i8> %shuf to <2 x double>
  ret <2 x double> %cvt
}

define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_2f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_2f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuf
}

define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm4, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT:    subpd %xmm4, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i64_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i64_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vhaddpd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i64> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT:    movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
; SSE-NEXT:    subpd %xmm5, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT:    subpd %xmm5, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
; SSE-NEXT:    addpd %xmm4, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i16_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x double>
  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i8_to_4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x double>
  ret <4 x double> %cvt
}

define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f64:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_4f64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x double>
  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %shuf
}

;
; Signed Integer to Float
;

define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = sitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = sitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i32_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = sitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_4i64_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = sitofp <4 x i64> %a to <4 x float>
  ret <4 x float> %cvt
}

define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: sitofp_8i32_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: sitofp_8i32_to_8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT:    retq
  %cvt = sitofp <8 x i32> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i16_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i16_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <8 x i16> %a to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_8i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_8i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cvt = sitofp <8 x i8> %shuf to <8 x float>
  ret <8 x float> %cvt
}

define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_8f32:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    cvtdq2ps %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sitofp_16i8_to_8f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    retq
  %cvt = sitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuf
}

;
; Unsigned Integer to Float
;

define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_1
; SSE-NEXT:    # BB#2:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB38_3
; SSE-NEXT:    .LBB38_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:    .LBB38_3:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB38_4
; SSE-NEXT:    # BB#5:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
; SSE-NEXT:    .LBB38_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_2i64_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_1
; AVX-NEXT:    # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB38_3
; AVX-NEXT:    .LBB38_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:    .LBB38_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_4
; AVX-NEXT:    # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB38_6
; AVX-NEXT:    .LBB38_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    .LBB38_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB38_8
; AVX-NEXT:    # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    .LBB38_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %cvt = uitofp <2 x i64> %a to <2 x float>
  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %ext
}

define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    js .LBB39_2
; SSE-NEXT:    # BB#1:
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    .LBB39_2:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_3
; SSE-NEXT:    # BB#4:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB39_5
; SSE-NEXT:    .LBB39_3:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:    .LBB39_5:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB39_6
; SSE-NEXT:    # BB#7:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB39_8
; SSE-NEXT:    .LBB39_6:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:    .LBB39_8:
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i64_to_4f32_undef:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_1
; AVX-NEXT:    # BB#2:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    jmp .LBB39_3
; AVX-NEXT:    .LBB39_1:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
; AVX-NEXT:    .LBB39_3:
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1, %ecx
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_4
; AVX-NEXT:    # BB#5:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX-NEXT:    jmp .LBB39_6
; AVX-NEXT:    .LBB39_4:
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    orq %rax, %rcx
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    .LBB39_6:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    testq %rax, %rax
; AVX-NEXT:    js .LBB39_8
; AVX-NEXT:    # BB#7:
; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX-NEXT:    .LBB39_8:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq
  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %cvt = uitofp <4 x i64> %ext to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    por {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    por {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i32_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_4i32_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i16_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i16> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_8i16_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_8i16_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <8 x i16> %a to <8 x float>
  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: uitofp_4i8_to_4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cvt = uitofp <4 x i8> %shuf to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT:    # kill
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: uitofp_16i8_to_4f32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT:    # kill
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %cvt = uitofp <16 x i8> %a to <16 x float>
  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuf
}

define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_1
; SSE-NEXT:    # BB#2:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm3
; SSE-NEXT:    jmp .LBB45_3
; SSE-NEXT:    .LBB45_1:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm3
; SSE-NEXT:    addss %xmm3, %xmm3
; SSE-NEXT:    .LBB45_3:
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_4
; SSE-NEXT:    # BB#5:
; SSE-NEXT:    cvtsi2ssq %rax, %xmm2
; SSE-NEXT:    jmp .LBB45_6
; SSE-NEXT:    .LBB45_4:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm2
; SSE-NEXT:    addss %xmm2, %xmm2
; SSE-NEXT:    .LBB45_6:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm1, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_7
; SSE-NEXT:    # BB#8:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rax, %xmm1
; SSE-NEXT:    jmp .LBB45_9
; SSE-NEXT:    .LBB45_7:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm1
; SSE-NEXT:    .LBB45_9:
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %rax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1, %ecx
; SSE-NEXT:    testq %rax, %rax
; SSE-NEXT:    js .LBB45_10
; SSE-NEXT:    # BB#11:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rax, %xmm0
; SSE-NEXT:    jmp .LBB45_12
; SSE-NEXT:    .LBB45_10:
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    orq %rax, %rcx
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    cvtsi2ssq %rcx, %xmm0
; SSE-NEXT:    addss %xmm0, %xmm0
; SSE-NEXT:    .LBB45_12:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    movl %eax, %ecx
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB45_1
; AVX1-NEXT:    # BB#2:
; AVX1-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT:    jmp .LBB45_3
; AVX1-NEXT:    .LBB45_1:
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    orq %rax, %rcx
; AVX1-NEXT:    vcvtsi2ssq %rcx, %xmm0, %xmm1
1555; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 1556; AVX1-NEXT: .LBB45_3: 1557; AVX1-NEXT: vmovq %xmm0, %rax 1558; AVX1-NEXT: movl %eax, %ecx 1559; AVX1-NEXT: andl $1, %ecx 1560; AVX1-NEXT: testq %rax, %rax 1561; AVX1-NEXT: js .LBB45_4 1562; AVX1-NEXT: # BB#5: 1563; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1564; AVX1-NEXT: jmp .LBB45_6 1565; AVX1-NEXT: .LBB45_4: 1566; AVX1-NEXT: shrq %rax 1567; AVX1-NEXT: orq %rax, %rcx 1568; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1569; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 1570; AVX1-NEXT: .LBB45_6: 1571; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 1572; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1573; AVX1-NEXT: vmovq %xmm0, %rax 1574; AVX1-NEXT: movl %eax, %ecx 1575; AVX1-NEXT: andl $1, %ecx 1576; AVX1-NEXT: testq %rax, %rax 1577; AVX1-NEXT: js .LBB45_7 1578; AVX1-NEXT: # BB#8: 1579; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1580; AVX1-NEXT: jmp .LBB45_9 1581; AVX1-NEXT: .LBB45_7: 1582; AVX1-NEXT: shrq %rax 1583; AVX1-NEXT: orq %rax, %rcx 1584; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1585; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 1586; AVX1-NEXT: .LBB45_9: 1587; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 1588; AVX1-NEXT: vpextrq $1, %xmm0, %rax 1589; AVX1-NEXT: movl %eax, %ecx 1590; AVX1-NEXT: andl $1, %ecx 1591; AVX1-NEXT: testq %rax, %rax 1592; AVX1-NEXT: js .LBB45_10 1593; AVX1-NEXT: # BB#11: 1594; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 1595; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 1596; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1597; AVX1-NEXT: vzeroupper 1598; AVX1-NEXT: retq 1599; AVX1-NEXT: .LBB45_10: 1600; AVX1-NEXT: shrq %rax 1601; AVX1-NEXT: orq %rax, %rcx 1602; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 1603; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 1604; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 1605; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1606; AVX1-NEXT: vzeroupper 1607; AVX1-NEXT: retq 1608; 1609; AVX2-LABEL: uitofp_4i64_to_4f32: 1610; AVX2: # BB#0: 1611; AVX2-NEXT: vpextrq $1, %xmm0, %rax 1612; AVX2-NEXT: movl %eax, %ecx 1613; AVX2-NEXT: andl $1, %ecx 1614; AVX2-NEXT: testq %rax, %rax 1615; AVX2-NEXT: js .LBB45_1 1616; AVX2-NEXT: # BB#2: 1617; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 1618; AVX2-NEXT: jmp .LBB45_3 1619; AVX2-NEXT: .LBB45_1: 1620; AVX2-NEXT: shrq %rax 1621; AVX2-NEXT: orq %rax, %rcx 1622; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 1623; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 1624; AVX2-NEXT: .LBB45_3: 1625; AVX2-NEXT: vmovq %xmm0, %rax 1626; AVX2-NEXT: movl %eax, %ecx 1627; AVX2-NEXT: andl $1, %ecx 1628; AVX2-NEXT: testq %rax, %rax 1629; AVX2-NEXT: js .LBB45_4 1630; AVX2-NEXT: # BB#5: 1631; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1632; AVX2-NEXT: jmp .LBB45_6 1633; AVX2-NEXT: .LBB45_4: 1634; AVX2-NEXT: shrq %rax 1635; AVX2-NEXT: orq %rax, %rcx 1636; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1637; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 1638; AVX2-NEXT: .LBB45_6: 1639; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 1640; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1641; AVX2-NEXT: vmovq %xmm0, %rax 1642; AVX2-NEXT: movl %eax, %ecx 1643; AVX2-NEXT: andl $1, %ecx 1644; AVX2-NEXT: testq %rax, %rax 1645; AVX2-NEXT: js .LBB45_7 1646; AVX2-NEXT: # BB#8: 1647; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 1648; AVX2-NEXT: jmp .LBB45_9 1649; AVX2-NEXT: .LBB45_7: 1650; AVX2-NEXT: shrq %rax 1651; AVX2-NEXT: orq %rax, %rcx 1652; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 1653; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 1654; AVX2-NEXT: .LBB45_9: 1655; AVX2-NEXT: vinsertps {{.*#+}} 
xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 1656; AVX2-NEXT: vpextrq $1, %xmm0, %rax 1657; AVX2-NEXT: movl %eax, %ecx 1658; AVX2-NEXT: andl $1, %ecx 1659; AVX2-NEXT: testq %rax, %rax 1660; AVX2-NEXT: js .LBB45_10 1661; AVX2-NEXT: # BB#11: 1662; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 1663; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 1664; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1665; AVX2-NEXT: vzeroupper 1666; AVX2-NEXT: retq 1667; AVX2-NEXT: .LBB45_10: 1668; AVX2-NEXT: shrq %rax 1669; AVX2-NEXT: orq %rax, %rcx 1670; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 1671; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 1672; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 1673; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1674; AVX2-NEXT: vzeroupper 1675; AVX2-NEXT: retq 1676 %cvt = uitofp <4 x i64> %a to <4 x float> 1677 ret <4 x float> %cvt 1678} 1679 1680define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { 1681; SSE-LABEL: uitofp_8i32_to_8f32: 1682; SSE: # BB#0: 1683; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] 1684; SSE-NEXT: movdqa %xmm0, %xmm3 1685; SSE-NEXT: pand %xmm2, %xmm3 1686; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] 1687; SSE-NEXT: por %xmm4, %xmm3 1688; SSE-NEXT: psrld $16, %xmm0 1689; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] 1690; SSE-NEXT: por %xmm5, %xmm0 1691; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11] 1692; SSE-NEXT: addps %xmm6, %xmm0 1693; SSE-NEXT: addps %xmm3, %xmm0 1694; SSE-NEXT: pand %xmm1, %xmm2 1695; SSE-NEXT: por %xmm4, %xmm2 1696; SSE-NEXT: psrld $16, %xmm1 1697; SSE-NEXT: por %xmm5, %xmm1 1698; SSE-NEXT: addps %xmm6, %xmm1 1699; SSE-NEXT: addps %xmm2, %xmm1 1700; SSE-NEXT: retq 1701; 1702; AVX1-LABEL: uitofp_8i32_to_8f32: 1703; AVX1: # BB#0: 1704; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 1705; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 1706; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 1707; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1708; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 1709; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1710; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1711; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1712; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 1713; AVX1-NEXT: retq 1714; 1715; AVX2-LABEL: uitofp_8i32_to_8f32: 1716; AVX2: # BB#0: 1717; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 1718; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 1719; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 1720; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 1721; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] 1722; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 1723; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 1724; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 1725; AVX2-NEXT: retq 1726 %cvt = uitofp <8 x i32> %a to <8 x float> 1727 ret <8 x float> %cvt 1728} 1729 1730define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { 1731; SSE-LABEL: uitofp_8i16_to_8f32: 1732; SSE: # BB#0: 1733; SSE-NEXT: pxor %xmm1, %xmm1 1734; SSE-NEXT: movdqa %xmm0, %xmm2 1735; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1736; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1737; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1738; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1739; SSE-NEXT: 
movaps %xmm2, %xmm0 1740; SSE-NEXT: retq 1741; 1742; AVX1-LABEL: uitofp_8i16_to_8f32: 1743; AVX1: # BB#0: 1744; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1745; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1746; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1747; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1748; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1749; AVX1-NEXT: retq 1750; 1751; AVX2-LABEL: uitofp_8i16_to_8f32: 1752; AVX2: # BB#0: 1753; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1754; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1755; AVX2-NEXT: retq 1756 %cvt = uitofp <8 x i16> %a to <8 x float> 1757 ret <8 x float> %cvt 1758} 1759 1760define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { 1761; SSE-LABEL: uitofp_8i8_to_8f32: 1762; SSE: # BB#0: 1763; SSE-NEXT: pxor %xmm1, %xmm1 1764; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1765; SSE-NEXT: movdqa %xmm0, %xmm2 1766; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1767; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1768; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1769; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1770; SSE-NEXT: movaps %xmm2, %xmm0 1771; SSE-NEXT: retq 1772; 1773; AVX1-LABEL: uitofp_8i8_to_8f32: 1774; AVX1: # BB#0: 1775; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1776; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 1777; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1778; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1779; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1780; AVX1-NEXT: retq 1781; 1782; AVX2-LABEL: uitofp_8i8_to_8f32: 1783; AVX2: # BB#0: 1784; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1785; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1786; AVX2-NEXT: retq 1787 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1788 %cvt = uitofp <8 x i8> %shuf to <8 x float> 1789 ret <8 x float> %cvt 1790} 1791 1792define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { 1793; SSE-LABEL: uitofp_16i8_to_8f32: 1794; SSE: # BB#0: 1795; SSE-NEXT: pxor %xmm1, %xmm1 1796; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1797; SSE-NEXT: movdqa %xmm0, %xmm2 1798; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1799; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 1800; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1801; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 1802; SSE-NEXT: movaps %xmm2, %xmm0 1803; SSE-NEXT: retq 1804; 1805; AVX1-LABEL: uitofp_16i8_to_8f32: 1806; AVX1: # BB#0: 1807; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1808; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 
1809; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1810; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1811; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 1812; AVX1-NEXT: retq 1813; 1814; AVX2-LABEL: uitofp_16i8_to_8f32: 1815; AVX2: # BB#0: 1816; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1817; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1818; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 1819; AVX2-NEXT: retq 1820 %cvt = uitofp <16 x i8> %a to <16 x float> 1821 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1822 ret <8 x float> %shuf 1823} 1824 1825; 1826; Load Signed Integer to Double 1827; 1828 1829define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { 1830; SSE-LABEL: sitofp_load_2i64_to_2f64: 1831; SSE: # BB#0: 1832; SSE-NEXT: movdqa (%rdi), %xmm1 1833; SSE-NEXT: movd %xmm1, %rax 1834; SSE-NEXT: cvtsi2sdq %rax, %xmm0 1835; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1836; SSE-NEXT: movd %xmm1, %rax 1837; SSE-NEXT: xorps %xmm1, %xmm1 1838; SSE-NEXT: cvtsi2sdq %rax, %xmm1 1839; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1840; SSE-NEXT: retq 1841; 1842; AVX-LABEL: sitofp_load_2i64_to_2f64: 1843; AVX: # BB#0: 1844; AVX-NEXT: vmovdqa (%rdi), %xmm0 1845; AVX-NEXT: vpextrq $1, %xmm0, %rax 1846; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 1847; AVX-NEXT: vmovq %xmm0, %rax 1848; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1849; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 1850; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1851; AVX-NEXT: retq 1852 %ld = load <2 x i64>, <2 x i64> *%a 1853 %cvt = sitofp <2 x i64> %ld to <2 x double> 1854 ret <2 x double> %cvt 1855} 1856 1857define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) { 1858; SSE-LABEL: sitofp_load_2i32_to_2f64: 1859; SSE: # BB#0: 1860; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 1861; SSE-NEXT: retq 1862; 1863; AVX-LABEL: sitofp_load_2i32_to_2f64: 1864; AVX: # BB#0: 1865; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 1866; AVX-NEXT: retq 1867 %ld = load <2 x i32>, <2 x i32> *%a 1868 %cvt = sitofp <2 x i32> %ld to <2 x double> 1869 ret <2 x double> %cvt 1870} 1871 1872define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { 1873; SSE-LABEL: sitofp_load_2i16_to_2f64: 1874; SSE: # BB#0: 1875; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1876; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1877; SSE-NEXT: psrad $16, %xmm0 1878; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 1879; SSE-NEXT: retq 1880; 1881; AVX-LABEL: sitofp_load_2i16_to_2f64: 1882; AVX: # BB#0: 1883; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 1884; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1885; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 1886; AVX-NEXT: retq 1887 %ld = load <2 x i16>, <2 x i16> *%a 1888 %cvt = sitofp <2 x i16> %ld to <2 x double> 1889 ret <2 x double> %cvt 1890} 1891 1892define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) { 1893; SSE-LABEL: sitofp_load_2i8_to_2f64: 1894; SSE: # BB#0: 1895; SSE-NEXT: movzwl (%rdi), %eax 1896; SSE-NEXT: movd %eax, %xmm0 1897; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1898; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1899; SSE-NEXT: psrad $24, 
%xmm0 1900; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 1901; SSE-NEXT: retq 1902; 1903; AVX-LABEL: sitofp_load_2i8_to_2f64: 1904; AVX: # BB#0: 1905; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 1906; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1907; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 1908; AVX-NEXT: retq 1909 %ld = load <2 x i8>, <2 x i8> *%a 1910 %cvt = sitofp <2 x i8> %ld to <2 x double> 1911 ret <2 x double> %cvt 1912} 1913 1914define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { 1915; SSE-LABEL: sitofp_load_4i64_to_4f64: 1916; SSE: # BB#0: 1917; SSE-NEXT: movdqa (%rdi), %xmm1 1918; SSE-NEXT: movdqa 16(%rdi), %xmm2 1919; SSE-NEXT: movd %xmm1, %rax 1920; SSE-NEXT: cvtsi2sdq %rax, %xmm0 1921; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1922; SSE-NEXT: movd %xmm1, %rax 1923; SSE-NEXT: xorps %xmm1, %xmm1 1924; SSE-NEXT: cvtsi2sdq %rax, %xmm1 1925; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1926; SSE-NEXT: movd %xmm2, %rax 1927; SSE-NEXT: xorps %xmm1, %xmm1 1928; SSE-NEXT: cvtsi2sdq %rax, %xmm1 1929; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1930; SSE-NEXT: movd %xmm2, %rax 1931; SSE-NEXT: xorps %xmm2, %xmm2 1932; SSE-NEXT: cvtsi2sdq %rax, %xmm2 1933; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1934; SSE-NEXT: retq 1935; 1936; AVX1-LABEL: sitofp_load_4i64_to_4f64: 1937; AVX1: # BB#0: 1938; AVX1-NEXT: vmovaps (%rdi), %ymm0 1939; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1940; AVX1-NEXT: vpextrq $1, %xmm1, %rax 1941; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1942; AVX1-NEXT: vmovq %xmm1, %rax 1943; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 1944; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1945; AVX1-NEXT: vpextrq $1, %xmm0, %rax 1946; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1947; AVX1-NEXT: vmovq %xmm0, %rax 1948; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 1949; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 1950; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1951; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1952; AVX1-NEXT: retq 1953; 1954; AVX2-LABEL: sitofp_load_4i64_to_4f64: 1955; AVX2: # BB#0: 1956; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1957; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1958; AVX2-NEXT: vpextrq $1, %xmm1, %rax 1959; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1960; AVX2-NEXT: vmovq %xmm1, %rax 1961; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 1962; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1963; AVX2-NEXT: vpextrq $1, %xmm0, %rax 1964; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 1965; AVX2-NEXT: vmovq %xmm0, %rax 1966; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 1967; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 1968; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] 1969; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1970; AVX2-NEXT: retq 1971 %ld = load <4 x i64>, <4 x i64> *%a 1972 %cvt = sitofp <4 x i64> %ld to <4 x double> 1973 ret <4 x double> %cvt 1974} 1975 1976define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) { 1977; SSE-LABEL: sitofp_load_4i32_to_4f64: 1978; SSE: # BB#0: 1979; SSE-NEXT: movdqa (%rdi), %xmm1 1980; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 1981; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 1982; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 1983; SSE-NEXT: retq 1984; 1985; AVX-LABEL: sitofp_load_4i32_to_4f64: 1986; AVX: # BB#0: 1987; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0 1988; AVX-NEXT: retq 1989 %ld = load <4 x i32>, <4 x i32> *%a 1990 %cvt = sitofp <4 x i32> %ld to <4 x double> 1991 ret <4 x double> %cvt 1992} 1993 1994define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { 1995; SSE-LABEL: sitofp_load_4i16_to_4f64: 1996; SSE: # BB#0: 1997; 
SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1998; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1999; SSE-NEXT: psrad $16, %xmm1 2000; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2001; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2002; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2003; SSE-NEXT: retq 2004; 2005; AVX-LABEL: sitofp_load_4i16_to_4f64: 2006; AVX: # BB#0: 2007; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 2008; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2009; AVX-NEXT: retq 2010 %ld = load <4 x i16>, <4 x i16> *%a 2011 %cvt = sitofp <4 x i16> %ld to <4 x double> 2012 ret <4 x double> %cvt 2013} 2014 2015define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { 2016; SSE-LABEL: sitofp_load_4i8_to_4f64: 2017; SSE: # BB#0: 2018; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2019; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2020; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2021; SSE-NEXT: psrad $24, %xmm1 2022; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2023; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2024; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2025; SSE-NEXT: retq 2026; 2027; AVX-LABEL: sitofp_load_4i8_to_4f64: 2028; AVX: # BB#0: 2029; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 2030; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2031; AVX-NEXT: retq 2032 %ld = load <4 x i8>, <4 x i8> *%a 2033 %cvt = sitofp <4 x i8> %ld to <4 x double> 2034 ret <4 x double> %cvt 2035} 2036 2037; 2038; Load Unsigned Integer to Double 2039; 2040 2041define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { 2042; SSE-LABEL: uitofp_load_2i64_to_2f64: 2043; SSE: # BB#0: 2044; SSE-NEXT: movdqa (%rdi), %xmm1 2045; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2046; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] 2047; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2048; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2049; SSE-NEXT: subpd %xmm4, %xmm1 2050; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 2051; SSE-NEXT: addpd %xmm1, %xmm0 2052; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2053; SSE-NEXT: subpd %xmm4, %xmm3 2054; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] 2055; SSE-NEXT: addpd %xmm3, %xmm1 2056; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2057; SSE-NEXT: retq 2058; 2059; AVX-LABEL: uitofp_load_2i64_to_2f64: 2060; AVX: # BB#0: 2061; AVX-NEXT: vmovdqa (%rdi), %xmm0 2062; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 2063; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2064; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 2065; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 2066; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 2067; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2068; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2069; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 2070; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2071; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2072; AVX-NEXT: retq 2073 %ld = load <2 x i64>, <2 x i64> *%a 2074 %cvt = uitofp <2 x i64> %ld to <2 x double> 2075 ret <2 x double> %cvt 2076} 2077 2078define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { 2079; SSE-LABEL: uitofp_load_2i32_to_2f64: 2080; SSE: # BB#0: 2081; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2082; SSE-NEXT: pxor %xmm0, %xmm0 2083; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2084; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2085; SSE-NEXT: 
pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] 2086; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2087; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2088; SSE-NEXT: subpd %xmm4, %xmm1 2089; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 2090; SSE-NEXT: addpd %xmm1, %xmm0 2091; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2092; SSE-NEXT: subpd %xmm4, %xmm3 2093; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] 2094; SSE-NEXT: addpd %xmm3, %xmm1 2095; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2096; SSE-NEXT: retq 2097; 2098; AVX-LABEL: uitofp_load_2i32_to_2f64: 2099; AVX: # BB#0: 2100; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero 2101; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] 2102; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2103; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] 2104; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 2105; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 2106; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2107; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2108; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 2109; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2110; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2111; AVX-NEXT: retq 2112 %ld = load <2 x i32>, <2 x i32> *%a 2113 %cvt = uitofp <2 x i32> %ld to <2 x double> 2114 ret <2 x double> %cvt 2115} 2116 2117define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { 2118; SSE-LABEL: uitofp_load_2i16_to_2f64: 2119; SSE: # BB#0: 2120; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2121; SSE-NEXT: pxor %xmm1, %xmm1 2122; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2123; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 2124; SSE-NEXT: retq 2125; 2126; AVX-LABEL: uitofp_load_2i16_to_2f64: 2127; AVX: # BB#0: 2128; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2129; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2130; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 2131; AVX-NEXT: retq 2132 %ld = load <2 x i16>, <2 x i16> *%a 2133 %cvt = uitofp <2 x i16> %ld to <2 x double> 2134 ret <2 x double> %cvt 2135} 2136 2137define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) { 2138; SSE-LABEL: uitofp_load_2i8_to_2f64: 2139; SSE: # BB#0: 2140; SSE-NEXT: movzwl (%rdi), %eax 2141; SSE-NEXT: movd %eax, %xmm0 2142; SSE-NEXT: pxor %xmm1, %xmm1 2143; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2144; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2145; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 2146; SSE-NEXT: retq 2147; 2148; AVX-LABEL: uitofp_load_2i8_to_2f64: 2149; AVX: # BB#0: 2150; AVX-NEXT: movzwl (%rdi), %eax 2151; AVX-NEXT: vmovd %eax, %xmm0 2152; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2153; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 2154; AVX-NEXT: retq 2155 %ld = load <2 x i8>, <2 x i8> *%a 2156 %cvt = uitofp <2 x i8> %ld to <2 x double> 2157 ret <2 x double> %cvt 2158} 2159 2160define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { 2161; SSE-LABEL: uitofp_load_4i64_to_4f64: 2162; SSE: # BB#0: 2163; SSE-NEXT: movdqa (%rdi), %xmm1 2164; SSE-NEXT: movdqa 16(%rdi), %xmm2 2165; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] 2166; SSE-NEXT: pshufd {{.*#+}} 
xmm4 = xmm1[2,3,0,1] 2167; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 2168; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25] 2169; SSE-NEXT: subpd %xmm5, %xmm1 2170; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] 2171; SSE-NEXT: addpd %xmm1, %xmm0 2172; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 2173; SSE-NEXT: subpd %xmm5, %xmm4 2174; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1] 2175; SSE-NEXT: addpd %xmm4, %xmm1 2176; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2177; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] 2178; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 2179; SSE-NEXT: subpd %xmm5, %xmm2 2180; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 2181; SSE-NEXT: addpd %xmm2, %xmm1 2182; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 2183; SSE-NEXT: subpd %xmm5, %xmm4 2184; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] 2185; SSE-NEXT: addpd %xmm4, %xmm2 2186; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2187; SSE-NEXT: retq 2188; 2189; AVX1-LABEL: uitofp_load_4i64_to_4f64: 2190; AVX1: # BB#0: 2191; AVX1-NEXT: vmovaps (%rdi), %ymm0 2192; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2193; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2194; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2195; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2196; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2197; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2198; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2199; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2200; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1 2201; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 2202; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] 2203; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2204; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2205; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2206; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2207; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2208; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0 2209; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2210; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] 2211; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2212; AVX1-NEXT: retq 2213; 2214; AVX2-LABEL: uitofp_load_4i64_to_4f64: 2215; AVX2: # BB#0: 2216; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2217; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2218; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] 2219; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2220; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] 2221; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2222; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2223; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2224; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2225; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1 2226; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 2227; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] 2228; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2229; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 2230; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3 2231; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2232; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2233; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0 2234; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 2235; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] 2236; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm0 2237; AVX2-NEXT: retq 2238 %ld = load <4 x i64>, <4 x i64> *%a 2239 %cvt = uitofp <4 x i64> %ld to <4 x double> 2240 ret <4 x double> %cvt 2241} 2242 2243define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { 2244; SSE-LABEL: uitofp_load_4i32_to_4f64: 2245; SSE: # BB#0: 2246; SSE-NEXT: movdqa (%rdi), %xmm2 2247; SSE-NEXT: pxor %xmm1, %xmm1 2248; SSE-NEXT: movdqa %xmm2, %xmm3 2249; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2250; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0] 2251; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] 2252; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2253; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25] 2254; SSE-NEXT: subpd %xmm6, %xmm3 2255; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] 2256; SSE-NEXT: addpd %xmm3, %xmm0 2257; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 2258; SSE-NEXT: subpd %xmm6, %xmm5 2259; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1] 2260; SSE-NEXT: addpd %xmm5, %xmm3 2261; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] 2262; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 2263; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] 2264; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2265; SSE-NEXT: subpd %xmm6, %xmm2 2266; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] 2267; SSE-NEXT: addpd %xmm2, %xmm1 2268; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2269; SSE-NEXT: subpd %xmm6, %xmm3 2270; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] 2271; SSE-NEXT: addpd %xmm3, %xmm2 2272; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2273; SSE-NEXT: retq 2274; 2275; AVX1-LABEL: uitofp_load_4i32_to_4f64: 2276; AVX1: # BB#0: 2277; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2278; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 2279; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 2280; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 2281; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 2282; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 2283; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 2284; AVX1-NEXT: retq 2285; 2286; AVX2-LABEL: uitofp_load_4i32_to_4f64: 2287; AVX2: # BB#0: 2288; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2289; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 2290; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 2291; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 2292; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 2293; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 2294; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2295; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 2296; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 2297; AVX2-NEXT: retq 2298 %ld = load <4 x i32>, <4 x i32> *%a 2299 %cvt = uitofp <4 x i32> %ld to <4 x double> 2300 ret <4 x double> %cvt 2301} 2302 2303define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { 2304; SSE-LABEL: uitofp_load_4i16_to_4f64: 2305; SSE: # BB#0: 2306; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2307; SSE-NEXT: pxor %xmm0, %xmm0 2308; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2309; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2310; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2311; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2312; SSE-NEXT: retq 2313; 2314; AVX-LABEL: uitofp_load_4i16_to_4f64: 2315; AVX: # BB#0: 2316; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2317; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2318; AVX-NEXT: retq 2319 %ld = load <4 x i16>, <4 x i16> *%a 2320 %cvt = uitofp <4 x i16> %ld to <4 x double> 2321 ret <4 x double> %cvt 2322} 2323 2324define <4 x double> 
@uitofp_load_4i8_to_4f64(<4 x i8> *%a) { 2325; SSE-LABEL: uitofp_load_4i8_to_4f64: 2326; SSE: # BB#0: 2327; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2328; SSE-NEXT: pxor %xmm0, %xmm0 2329; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2330; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2331; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 2332; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2333; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 2334; SSE-NEXT: retq 2335; 2336; AVX-LABEL: uitofp_load_4i8_to_4f64: 2337; AVX: # BB#0: 2338; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2339; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 2340; AVX-NEXT: retq 2341 %ld = load <4 x i8>, <4 x i8> *%a 2342 %cvt = uitofp <4 x i8> %ld to <4 x double> 2343 ret <4 x double> %cvt 2344} 2345 2346; 2347; Load Signed Integer to Float 2348; 2349 2350define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { 2351; SSE-LABEL: sitofp_load_4i64_to_4f32: 2352; SSE: # BB#0: 2353; SSE-NEXT: movdqa (%rdi), %xmm1 2354; SSE-NEXT: movdqa 16(%rdi), %xmm2 2355; SSE-NEXT: movd %xmm2, %rax 2356; SSE-NEXT: cvtsi2ssq %rax, %xmm3 2357; SSE-NEXT: movd %xmm1, %rax 2358; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2359; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2360; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2361; SSE-NEXT: movd %xmm2, %rax 2362; SSE-NEXT: xorps %xmm2, %xmm2 2363; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2364; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2365; SSE-NEXT: movd %xmm1, %rax 2366; SSE-NEXT: xorps %xmm1, %xmm1 2367; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2368; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2369; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2370; SSE-NEXT: retq 2371; 2372; AVX1-LABEL: sitofp_load_4i64_to_4f32: 2373; AVX1: # BB#0: 2374; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2375; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2376; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2377; AVX1-NEXT: vmovq %xmm0, %rax 2378; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2379; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2380; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2381; AVX1-NEXT: vmovq %xmm0, %rax 2382; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2383; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2384; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2385; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2386; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2387; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2388; AVX1-NEXT: vzeroupper 2389; AVX1-NEXT: retq 2390; 2391; AVX2-LABEL: sitofp_load_4i64_to_4f32: 2392; AVX2: # BB#0: 2393; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2394; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2395; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2396; AVX2-NEXT: vmovq %xmm0, %rax 2397; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2398; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2399; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2400; AVX2-NEXT: vmovq %xmm0, %rax 2401; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2402; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2403; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2404; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2405; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2406; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2407; AVX2-NEXT: vzeroupper 2408; AVX2-NEXT: retq 2409 %ld = 
load <4 x i64>, <4 x i64> *%a 2410 %cvt = sitofp <4 x i64> %ld to <4 x float> 2411 ret <4 x float> %cvt 2412} 2413 2414define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) { 2415; SSE-LABEL: sitofp_load_4i32_to_4f32: 2416; SSE: # BB#0: 2417; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 2418; SSE-NEXT: retq 2419; 2420; AVX-LABEL: sitofp_load_4i32_to_4f32: 2421; AVX: # BB#0: 2422; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0 2423; AVX-NEXT: retq 2424 %ld = load <4 x i32>, <4 x i32> *%a 2425 %cvt = sitofp <4 x i32> %ld to <4 x float> 2426 ret <4 x float> %cvt 2427} 2428 2429define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) { 2430; SSE-LABEL: sitofp_load_4i16_to_4f32: 2431; SSE: # BB#0: 2432; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2433; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2434; SSE-NEXT: psrad $16, %xmm0 2435; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2436; SSE-NEXT: retq 2437; 2438; AVX-LABEL: sitofp_load_4i16_to_4f32: 2439; AVX: # BB#0: 2440; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 2441; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 2442; AVX-NEXT: retq 2443 %ld = load <4 x i16>, <4 x i16> *%a 2444 %cvt = sitofp <4 x i16> %ld to <4 x float> 2445 ret <4 x float> %cvt 2446} 2447 2448define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) { 2449; SSE-LABEL: sitofp_load_4i8_to_4f32: 2450; SSE: # BB#0: 2451; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2452; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2453; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2454; SSE-NEXT: psrad $24, %xmm0 2455; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2456; SSE-NEXT: retq 2457; 2458; AVX-LABEL: sitofp_load_4i8_to_4f32: 2459; AVX: # BB#0: 2460; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 2461; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 2462; AVX-NEXT: retq 2463 %ld = load <4 x i8>, <4 x i8> *%a 2464 %cvt = sitofp <4 x i8> %ld to <4 x float> 2465 ret <4 x float> %cvt 2466} 2467 2468define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { 2469; SSE-LABEL: sitofp_load_8i64_to_8f32: 2470; SSE: # BB#0: 2471; SSE-NEXT: movdqa (%rdi), %xmm1 2472; SSE-NEXT: movdqa 16(%rdi), %xmm2 2473; SSE-NEXT: movdqa 32(%rdi), %xmm3 2474; SSE-NEXT: movdqa 48(%rdi), %xmm4 2475; SSE-NEXT: movd %xmm2, %rax 2476; SSE-NEXT: cvtsi2ssq %rax, %xmm5 2477; SSE-NEXT: movd %xmm1, %rax 2478; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2479; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 2480; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2481; SSE-NEXT: movd %xmm2, %rax 2482; SSE-NEXT: xorps %xmm2, %xmm2 2483; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2484; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2485; SSE-NEXT: movd %xmm1, %rax 2486; SSE-NEXT: xorps %xmm1, %xmm1 2487; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2488; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2489; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2490; SSE-NEXT: movd %xmm4, %rax 2491; SSE-NEXT: xorps %xmm2, %xmm2 2492; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2493; SSE-NEXT: movd %xmm3, %rax 2494; SSE-NEXT: xorps %xmm1, %xmm1 2495; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2496; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2497; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] 2498; SSE-NEXT: movd %xmm2, %rax 2499; SSE-NEXT: xorps %xmm2, %xmm2 2500; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2501; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2502; SSE-NEXT: movd %xmm3, %rax 2503; SSE-NEXT: xorps %xmm3, %xmm3 2504; SSE-NEXT: cvtsi2ssq %rax, %xmm3 2505; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2506; SSE-NEXT: 
unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 2507; SSE-NEXT: retq 2508; 2509; AVX1-LABEL: sitofp_load_8i64_to_8f32: 2510; AVX1: # BB#0: 2511; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2512; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 2513; AVX1-NEXT: vpextrq $1, %xmm1, %rax 2514; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2515; AVX1-NEXT: vmovq %xmm1, %rax 2516; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2517; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2518; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2519; AVX1-NEXT: vmovq %xmm1, %rax 2520; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2521; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2522; AVX1-NEXT: vpextrq $1, %xmm1, %rax 2523; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2524; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] 2525; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2526; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2527; AVX1-NEXT: vmovq %xmm0, %rax 2528; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2529; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2530; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2531; AVX1-NEXT: vmovq %xmm0, %rax 2532; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2533; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2534; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2535; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2536; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2537; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 2538; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2539; AVX1-NEXT: retq 2540; 2541; AVX2-LABEL: sitofp_load_8i64_to_8f32: 2542; AVX2: # BB#0: 2543; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2544; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2545; AVX2-NEXT: vpextrq $1, %xmm1, %rax 2546; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2547; AVX2-NEXT: vmovq %xmm1, %rax 2548; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2549; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2550; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 2551; AVX2-NEXT: vmovq %xmm1, %rax 2552; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2553; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2554; AVX2-NEXT: vpextrq $1, %xmm1, %rax 2555; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2556; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] 2557; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2558; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2559; AVX2-NEXT: vmovq %xmm0, %rax 2560; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2561; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2562; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2563; AVX2-NEXT: vmovq %xmm0, %rax 2564; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 2565; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] 2566; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2567; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2568; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2569; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] 2570; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2571; AVX2-NEXT: retq 2572 %ld = load <8 x i64>, <8 x i64> *%a 2573 %cvt = sitofp <8 x i64> %ld to <8 x float> 2574 ret <8 x float> %cvt 2575} 2576 2577define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) { 2578; SSE-LABEL: sitofp_load_8i32_to_8f32: 2579; SSE: # BB#0: 2580; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 2581; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1 2582; SSE-NEXT: retq 2583; 2584; AVX-LABEL: sitofp_load_8i32_to_8f32: 2585; AVX: # BB#0: 2586; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0 2587; AVX-NEXT: retq 2588 %ld = load <8 x i32>, <8 x i32> *%a 2589 %cvt = sitofp <8 x i32> %ld to <8 x float> 2590 ret <8 x 
float> %cvt 2591} 2592 2593define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { 2594; SSE-LABEL: sitofp_load_8i16_to_8f32: 2595; SSE: # BB#0: 2596; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2597; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2598; SSE-NEXT: psrad $16, %xmm0 2599; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2600; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 2601; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 2602; SSE-NEXT: psrad $16, %xmm1 2603; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 2604; SSE-NEXT: retq 2605; 2606; AVX1-LABEL: sitofp_load_8i16_to_8f32: 2607; AVX1: # BB#0: 2608; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 2609; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 2610; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2611; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 2612; AVX1-NEXT: retq 2613; 2614; AVX2-LABEL: sitofp_load_8i16_to_8f32: 2615; AVX2: # BB#0: 2616; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 2617; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 2618; AVX2-NEXT: retq 2619 %ld = load <8 x i16>, <8 x i16> *%a 2620 %cvt = sitofp <8 x i16> %ld to <8 x float> 2621 ret <8 x float> %cvt 2622} 2623 2624define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { 2625; SSE-LABEL: sitofp_load_8i8_to_8f32: 2626; SSE: # BB#0: 2627; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2628; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2629; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2630; SSE-NEXT: psrad $24, %xmm0 2631; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2632; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2633; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2634; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 2635; SSE-NEXT: psrad $24, %xmm1 2636; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 2637; SSE-NEXT: retq 2638; 2639; AVX1-LABEL: sitofp_load_8i8_to_8f32: 2640; AVX1: # BB#0: 2641; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 2642; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 2643; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2644; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 2645; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2646; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 2647; AVX1-NEXT: retq 2648; 2649; AVX2-LABEL: sitofp_load_8i8_to_8f32: 2650; AVX2: # BB#0: 2651; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 2652; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 2653; AVX2-NEXT: retq 2654 %ld = load <8 x i8>, <8 x i8> *%a 2655 %cvt = sitofp <8 x i8> %ld to <8 x float> 2656 ret <8 x float> %cvt 2657} 2658 2659; 2660; Load Unsigned Integer to Float 2661; 2662 2663define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { 2664; SSE-LABEL: uitofp_load_4i64_to_4f32: 2665; SSE: # BB#0: 2666; SSE-NEXT: movdqa (%rdi), %xmm1 2667; SSE-NEXT: movdqa 16(%rdi), %xmm3 2668; SSE-NEXT: movd %xmm3, %rax 2669; SSE-NEXT: movl %eax, %ecx 2670; SSE-NEXT: andl $1, %ecx 2671; SSE-NEXT: testq %rax, %rax 2672; SSE-NEXT: js .LBB74_1 2673; SSE-NEXT: # BB#2: 2674; SSE-NEXT: cvtsi2ssq %rax, %xmm2 2675; SSE-NEXT: jmp .LBB74_3 2676; SSE-NEXT: .LBB74_1: 2677; SSE-NEXT: shrq %rax 2678; SSE-NEXT: orq %rax, %rcx 2679; SSE-NEXT: cvtsi2ssq %rcx, %xmm2 2680; SSE-NEXT: addss %xmm2, %xmm2 2681; SSE-NEXT: .LBB74_3: 2682; SSE-NEXT: movd %xmm1, %rax 2683; SSE-NEXT: movl %eax, %ecx 2684; SSE-NEXT: andl $1, %ecx 2685; SSE-NEXT: testq %rax, %rax 2686; SSE-NEXT: js .LBB74_4 2687; SSE-NEXT: # BB#5: 2688; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2689; SSE-NEXT: jmp .LBB74_6 2690; SSE-NEXT: .LBB74_4: 2691; SSE-NEXT: shrq %rax 2692; SSE-NEXT: orq %rax, %rcx 2693; SSE-NEXT: cvtsi2ssq %rcx, %xmm0 2694; SSE-NEXT: addss %xmm0, %xmm0 2695; 
SSE-NEXT: .LBB74_6: 2696; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2697; SSE-NEXT: movd %xmm3, %rax 2698; SSE-NEXT: movl %eax, %ecx 2699; SSE-NEXT: andl $1, %ecx 2700; SSE-NEXT: testq %rax, %rax 2701; SSE-NEXT: js .LBB74_7 2702; SSE-NEXT: # BB#8: 2703; SSE-NEXT: xorps %xmm3, %xmm3 2704; SSE-NEXT: cvtsi2ssq %rax, %xmm3 2705; SSE-NEXT: jmp .LBB74_9 2706; SSE-NEXT: .LBB74_7: 2707; SSE-NEXT: shrq %rax 2708; SSE-NEXT: orq %rax, %rcx 2709; SSE-NEXT: xorps %xmm3, %xmm3 2710; SSE-NEXT: cvtsi2ssq %rcx, %xmm3 2711; SSE-NEXT: addss %xmm3, %xmm3 2712; SSE-NEXT: .LBB74_9: 2713; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2714; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2715; SSE-NEXT: movd %xmm1, %rax 2716; SSE-NEXT: movl %eax, %ecx 2717; SSE-NEXT: andl $1, %ecx 2718; SSE-NEXT: testq %rax, %rax 2719; SSE-NEXT: js .LBB74_10 2720; SSE-NEXT: # BB#11: 2721; SSE-NEXT: xorps %xmm1, %xmm1 2722; SSE-NEXT: cvtsi2ssq %rax, %xmm1 2723; SSE-NEXT: jmp .LBB74_12 2724; SSE-NEXT: .LBB74_10: 2725; SSE-NEXT: shrq %rax 2726; SSE-NEXT: orq %rax, %rcx 2727; SSE-NEXT: xorps %xmm1, %xmm1 2728; SSE-NEXT: cvtsi2ssq %rcx, %xmm1 2729; SSE-NEXT: addss %xmm1, %xmm1 2730; SSE-NEXT: .LBB74_12: 2731; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 2732; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2733; SSE-NEXT: retq 2734; 2735; AVX1-LABEL: uitofp_load_4i64_to_4f32: 2736; AVX1: # BB#0: 2737; AVX1-NEXT: vmovdqa (%rdi), %ymm0 2738; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2739; AVX1-NEXT: movl %eax, %ecx 2740; AVX1-NEXT: andl $1, %ecx 2741; AVX1-NEXT: testq %rax, %rax 2742; AVX1-NEXT: js .LBB74_1 2743; AVX1-NEXT: # BB#2: 2744; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2745; AVX1-NEXT: jmp .LBB74_3 2746; AVX1-NEXT: .LBB74_1: 2747; AVX1-NEXT: shrq %rax 2748; AVX1-NEXT: orq %rax, %rcx 2749; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 2750; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 2751; AVX1-NEXT: .LBB74_3: 2752; AVX1-NEXT: vmovq %xmm0, %rax 2753; AVX1-NEXT: movl %eax, %ecx 2754; AVX1-NEXT: andl $1, %ecx 2755; AVX1-NEXT: testq %rax, %rax 2756; AVX1-NEXT: js .LBB74_4 2757; AVX1-NEXT: # BB#5: 2758; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2759; AVX1-NEXT: jmp .LBB74_6 2760; AVX1-NEXT: .LBB74_4: 2761; AVX1-NEXT: shrq %rax 2762; AVX1-NEXT: orq %rax, %rcx 2763; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2764; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 2765; AVX1-NEXT: .LBB74_6: 2766; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2767; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2768; AVX1-NEXT: vmovq %xmm0, %rax 2769; AVX1-NEXT: movl %eax, %ecx 2770; AVX1-NEXT: andl $1, %ecx 2771; AVX1-NEXT: testq %rax, %rax 2772; AVX1-NEXT: js .LBB74_7 2773; AVX1-NEXT: # BB#8: 2774; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2775; AVX1-NEXT: jmp .LBB74_9 2776; AVX1-NEXT: .LBB74_7: 2777; AVX1-NEXT: shrq %rax 2778; AVX1-NEXT: orq %rax, %rcx 2779; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2780; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 2781; AVX1-NEXT: .LBB74_9: 2782; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2783; AVX1-NEXT: vpextrq $1, %xmm0, %rax 2784; AVX1-NEXT: movl %eax, %ecx 2785; AVX1-NEXT: andl $1, %ecx 2786; AVX1-NEXT: testq %rax, %rax 2787; AVX1-NEXT: js .LBB74_10 2788; AVX1-NEXT: # BB#11: 2789; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2790; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2791; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2792; AVX1-NEXT: vzeroupper 2793; AVX1-NEXT: retq 2794; AVX1-NEXT: .LBB74_10: 2795; AVX1-NEXT: shrq %rax 2796; AVX1-NEXT: orq %rax, 
%rcx 2797; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 2798; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 2799; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 2800; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2801; AVX1-NEXT: vzeroupper 2802; AVX1-NEXT: retq 2803; 2804; AVX2-LABEL: uitofp_load_4i64_to_4f32: 2805; AVX2: # BB#0: 2806; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2807; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2808; AVX2-NEXT: movl %eax, %ecx 2809; AVX2-NEXT: andl $1, %ecx 2810; AVX2-NEXT: testq %rax, %rax 2811; AVX2-NEXT: js .LBB74_1 2812; AVX2-NEXT: # BB#2: 2813; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 2814; AVX2-NEXT: jmp .LBB74_3 2815; AVX2-NEXT: .LBB74_1: 2816; AVX2-NEXT: shrq %rax 2817; AVX2-NEXT: orq %rax, %rcx 2818; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 2819; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 2820; AVX2-NEXT: .LBB74_3: 2821; AVX2-NEXT: vmovq %xmm0, %rax 2822; AVX2-NEXT: movl %eax, %ecx 2823; AVX2-NEXT: andl $1, %ecx 2824; AVX2-NEXT: testq %rax, %rax 2825; AVX2-NEXT: js .LBB74_4 2826; AVX2-NEXT: # BB#5: 2827; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2828; AVX2-NEXT: jmp .LBB74_6 2829; AVX2-NEXT: .LBB74_4: 2830; AVX2-NEXT: shrq %rax 2831; AVX2-NEXT: orq %rax, %rcx 2832; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2833; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 2834; AVX2-NEXT: .LBB74_6: 2835; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] 2836; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2837; AVX2-NEXT: vmovq %xmm0, %rax 2838; AVX2-NEXT: movl %eax, %ecx 2839; AVX2-NEXT: andl $1, %ecx 2840; AVX2-NEXT: testq %rax, %rax 2841; AVX2-NEXT: js .LBB74_7 2842; AVX2-NEXT: # BB#8: 2843; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 2844; AVX2-NEXT: jmp .LBB74_9 2845; AVX2-NEXT: .LBB74_7: 2846; AVX2-NEXT: shrq %rax 2847; AVX2-NEXT: orq %rax, %rcx 2848; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 2849; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 2850; AVX2-NEXT: .LBB74_9: 2851; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 2852; AVX2-NEXT: vpextrq $1, %xmm0, %rax 2853; AVX2-NEXT: movl %eax, %ecx 2854; AVX2-NEXT: andl $1, %ecx 2855; AVX2-NEXT: testq %rax, %rax 2856; AVX2-NEXT: js .LBB74_10 2857; AVX2-NEXT: # BB#11: 2858; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2859; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 2860; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2861; AVX2-NEXT: vzeroupper 2862; AVX2-NEXT: retq 2863; AVX2-NEXT: .LBB74_10: 2864; AVX2-NEXT: shrq %rax 2865; AVX2-NEXT: orq %rax, %rcx 2866; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 2867; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 2868; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 2869; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2870; AVX2-NEXT: vzeroupper 2871; AVX2-NEXT: retq 2872 %ld = load <4 x i64>, <4 x i64> *%a 2873 %cvt = uitofp <4 x i64> %ld to <4 x float> 2874 ret <4 x float> %cvt 2875} 2876 2877define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) { 2878; SSE-LABEL: uitofp_load_4i32_to_4f32: 2879; SSE: # BB#0: 2880; SSE-NEXT: movdqa (%rdi), %xmm0 2881; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] 2882; SSE-NEXT: pand %xmm0, %xmm1 2883; SSE-NEXT: por {{.*}}(%rip), %xmm1 2884; SSE-NEXT: psrld $16, %xmm0 2885; SSE-NEXT: por {{.*}}(%rip), %xmm0 2886; SSE-NEXT: addps {{.*}}(%rip), %xmm0 2887; SSE-NEXT: addps %xmm1, %xmm0 2888; SSE-NEXT: retq 2889; 2890; AVX1-LABEL: uitofp_load_4i32_to_4f32: 2891; AVX1: # BB#0: 2892; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2893; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] 2894; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 2895; 
AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] 2896; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 2897; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 2898; AVX1-NEXT: retq 2899; 2900; AVX2-LABEL: uitofp_load_4i32_to_4f32: 2901; AVX2: # BB#0: 2902; AVX2-NEXT: vmovdqa (%rdi), %xmm0 2903; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 2904; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 2905; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 2906; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 2907; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 2908; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 2909; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 2910; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 2911; AVX2-NEXT: retq 2912 %ld = load <4 x i32>, <4 x i32> *%a 2913 %cvt = uitofp <4 x i32> %ld to <4 x float> 2914 ret <4 x float> %cvt 2915} 2916 2917define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) { 2918; SSE-LABEL: uitofp_load_4i16_to_4f32: 2919; SSE: # BB#0: 2920; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2921; SSE-NEXT: pxor %xmm1, %xmm1 2922; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2923; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2924; SSE-NEXT: retq 2925; 2926; AVX-LABEL: uitofp_load_4i16_to_4f32: 2927; AVX: # BB#0: 2928; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2929; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 2930; AVX-NEXT: retq 2931 %ld = load <4 x i16>, <4 x i16> *%a 2932 %cvt = uitofp <4 x i16> %ld to <4 x float> 2933 ret <4 x float> %cvt 2934} 2935 2936define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) { 2937; SSE-LABEL: uitofp_load_4i8_to_4f32: 2938; SSE: # BB#0: 2939; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2940; SSE-NEXT: pxor %xmm1, %xmm1 2941; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2942; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2943; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 2944; SSE-NEXT: retq 2945; 2946; AVX-LABEL: uitofp_load_4i8_to_4f32: 2947; AVX: # BB#0: 2948; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2949; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 2950; AVX-NEXT: retq 2951 %ld = load <4 x i8>, <4 x i8> *%a 2952 %cvt = uitofp <4 x i8> %ld to <4 x float> 2953 ret <4 x float> %cvt 2954} 2955 2956define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { 2957; SSE-LABEL: uitofp_load_8i64_to_8f32: 2958; SSE: # BB#0: 2959; SSE-NEXT: movdqa (%rdi), %xmm1 2960; SSE-NEXT: movdqa 16(%rdi), %xmm5 2961; SSE-NEXT: movdqa 32(%rdi), %xmm2 2962; SSE-NEXT: movdqa 48(%rdi), %xmm3 2963; SSE-NEXT: movd %xmm5, %rax 2964; SSE-NEXT: movl %eax, %ecx 2965; SSE-NEXT: andl $1, %ecx 2966; SSE-NEXT: testq %rax, %rax 2967; SSE-NEXT: js .LBB78_1 2968; SSE-NEXT: # BB#2: 2969; SSE-NEXT: cvtsi2ssq %rax, %xmm4 2970; SSE-NEXT: jmp .LBB78_3 2971; SSE-NEXT: .LBB78_1: 2972; SSE-NEXT: shrq %rax 2973; SSE-NEXT: orq %rax, %rcx 2974; SSE-NEXT: cvtsi2ssq %rcx, %xmm4 2975; SSE-NEXT: addss %xmm4, %xmm4 2976; SSE-NEXT: .LBB78_3: 2977; SSE-NEXT: movd %xmm1, %rax 2978; SSE-NEXT: movl %eax, %ecx 2979; SSE-NEXT: andl $1, %ecx 2980; SSE-NEXT: testq %rax, %rax 2981; SSE-NEXT: js .LBB78_4 2982; SSE-NEXT: # BB#5: 2983; SSE-NEXT: cvtsi2ssq %rax, %xmm0 2984; 
SSE-NEXT: jmp .LBB78_6
; SSE-NEXT: .LBB78_4:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB78_6:
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE-NEXT: movd %xmm5, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_7
; SSE-NEXT: # BB#8:
; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: jmp .LBB78_9
; SSE-NEXT: .LBB78_7:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: cvtsi2ssq %rcx, %xmm6
; SSE-NEXT: addss %xmm6, %xmm6
; SSE-NEXT: .LBB78_9:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_10
; SSE-NEXT: # BB#11:
; SSE-NEXT: xorps %xmm5, %xmm5
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: jmp .LBB78_12
; SSE-NEXT: .LBB78_10:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm5, %xmm5
; SSE-NEXT: cvtsi2ssq %rcx, %xmm5
; SSE-NEXT: addss %xmm5, %xmm5
; SSE-NEXT: .LBB78_12:
; SSE-NEXT: movd %xmm3, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_13
; SSE-NEXT: # BB#14:
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: jmp .LBB78_15
; SSE-NEXT: .LBB78_13:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: cvtsi2ssq %rcx, %xmm7
; SSE-NEXT: addss %xmm7, %xmm7
; SSE-NEXT: .LBB78_15:
; SSE-NEXT: movd %xmm2, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_16
; SSE-NEXT: # BB#17:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB78_18
; SSE-NEXT: .LBB78_16:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB78_18:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE-NEXT: movd %xmm3, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_19
; SSE-NEXT: # BB#20:
; SSE-NEXT: xorps %xmm3, %xmm3
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB78_21
; SSE-NEXT: .LBB78_19:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm3, %xmm3
; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB78_21:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movd %xmm2, %rax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_22
; SSE-NEXT: # BB#23:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB78_24
; SSE-NEXT: .LBB78_22:
; SSE-NEXT: shrq %rax
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB78_24:
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i64_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX1-NEXT: jmp .LBB78_3
; AVX1-NEXT: .LBB78_1:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB78_3:
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: jmp .LBB78_6
; AVX1-NEXT: .LBB78_4:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB78_6:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
; AVX1-NEXT: jmp .LBB78_9
; AVX1-NEXT: .LBB78_7:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB78_9:
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: jmp .LBB78_12
; AVX1-NEXT: .LBB78_10:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB78_12:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_13
; AVX1-NEXT: # BB#14:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX1-NEXT: jmp .LBB78_15
; AVX1-NEXT: .LBB78_13:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB78_15:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_16
; AVX1-NEXT: # BB#17:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX1-NEXT: jmp .LBB78_18
; AVX1-NEXT: .LBB78_16:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB78_18:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovq %xmm4, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_19
; AVX1-NEXT: # BB#20:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX1-NEXT: jmp .LBB78_21
; AVX1-NEXT: .LBB78_19:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX1-NEXT: .LBB78_21:
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_22
; AVX1-NEXT: # BB#23:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX1-NEXT: jmp .LBB78_24
; AVX1-NEXT: .LBB78_22:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB78_24:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i64_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
; AVX2-NEXT: jmp .LBB78_3
; AVX2-NEXT: .LBB78_1:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB78_3:
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: jmp .LBB78_6
; AVX2-NEXT: .LBB78_4:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB78_6:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
; AVX2-NEXT: jmp .LBB78_9
; AVX2-NEXT: .LBB78_7:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB78_9:
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: jmp .LBB78_12
; AVX2-NEXT: .LBB78_10:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB78_12:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_13
; AVX2-NEXT: # BB#14:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT: jmp .LBB78_15
; AVX2-NEXT: .LBB78_13:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT: .LBB78_15:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_16
; AVX2-NEXT: # BB#17:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
; AVX2-NEXT: jmp .LBB78_18
; AVX2-NEXT: .LBB78_16:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB78_18:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vmovq %xmm4, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_19
; AVX2-NEXT: # BB#20:
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT: jmp .LBB78_21
; AVX2-NEXT: .LBB78_19:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX2-NEXT: .LBB78_21:
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_22
; AVX2-NEXT: # BB#23:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
; AVX2-NEXT: jmp .LBB78_24
; AVX2-NEXT: .LBB78_22:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB78_24:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
 %ld = load <8 x i64>, <8 x i64> *%a
 %cvt = uitofp <8 x i64> %ld to <8 x float>
 ret <8 x float> %cvt
}

define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-LABEL: uitofp_load_8i32_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT: por %xmm4, %xmm3
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
; SSE-NEXT: addps %xmm6, %xmm0
; SSE-NEXT: addps %xmm3, %xmm0
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: por %xmm4, %xmm2
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: addps %xmm6, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i32_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i32_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
 %ld = load <8 x i32>, <8 x i32> *%a
 %cvt = uitofp <8 x i32> %ld to <8 x float>
 ret <8 x float> %cvt
}

define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-LABEL: uitofp_load_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
 %ld = load <8 x i16>, <8 x i16> *%a
 %cvt = uitofp <8 x i16> %ld to <8 x float>
 ret <8 x float> %cvt
}

define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-LABEL: uitofp_load_8i8_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
 %ld = load <8 x i8>, <8 x i8> *%a
 %cvt = uitofp <8 x i8> %ld to <8 x float>
 ret <8 x float> %cvt
}

;
; Aggregates
;

%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE: # BB#0:
; SSE-NEXT: movq 24(%rdi), %rax
; SSE-NEXT: movdqu 8(%rdi), %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: movq 24(%rdi), %rax
; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: movq 24(%rdi), %rax
; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: vmovaps %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
 %1 = load %Arguments, %Arguments* %a0, align 1
 %2 = extractvalue %Arguments %1, 1
 %3 = extractvalue %Arguments %1, 2
 %4 = sitofp <8 x i16> %2 to <8 x float>
 store <8 x float> %4, <8 x float>* %3, align 32
 ret void
}