1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3 4; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 5; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 6; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 7; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW 8; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST 9; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL 10 11define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) { 12; SSE-LABEL: shuffle_v4i32_0001: 13; SSE: # %bb.0: 14; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] 15; SSE-NEXT: retq 16; 17; AVX-LABEL: shuffle_v4i32_0001: 18; AVX: # %bb.0: 19; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] 20; AVX-NEXT: retq 21 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> 22 ret <4 x i32> %shuffle 23} 24define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) { 25; SSE-LABEL: shuffle_v4i32_0020: 26; SSE: # %bb.0: 27; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] 28; SSE-NEXT: retq 29; 30; AVX-LABEL: shuffle_v4i32_0020: 31; AVX: # %bb.0: 32; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] 33; AVX-NEXT: retq 34 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> 35 ret <4 x i32> %shuffle 36} 37define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) { 38; SSE-LABEL: shuffle_v4i32_0112: 39; SSE: # %bb.0: 40; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] 41; SSE-NEXT: retq 42; 43; AVX-LABEL: shuffle_v4i32_0112: 44; AVX: # %bb.0: 45; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] 46; AVX-NEXT: retq 47 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2> 48 ret <4 x i32> %shuffle 49} 50define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) { 51; SSE-LABEL: shuffle_v4i32_0300: 52; SSE: # %bb.0: 53; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] 54; SSE-NEXT: retq 55; 56; AVX-LABEL: shuffle_v4i32_0300: 57; AVX: # %bb.0: 58; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] 59; AVX-NEXT: retq 60 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> 61 ret <4 x i32> %shuffle 62} 63define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) { 64; SSE-LABEL: shuffle_v4i32_1000: 65; SSE: # %bb.0: 66; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 67; SSE-NEXT: retq 68; 69; AVX-LABEL: shuffle_v4i32_1000: 70; AVX: # %bb.0: 71; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] 72; AVX-NEXT: retq 73 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 74 ret <4 x i32> %shuffle 75} 76define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) { 77; SSE-LABEL: shuffle_v4i32_2200: 78; SSE: # %bb.0: 79; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] 80; SSE-NEXT: retq 81; 82; AVX-LABEL: shuffle_v4i32_2200: 83; AVX: # %bb.0: 84; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] 85; AVX-NEXT: retq 86 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> 87 ret <4 x i32> %shuffle 88} 89define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) { 90; SSE-LABEL: shuffle_v4i32_3330: 91; SSE: # %bb.0: 92; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] 93; SSE-NEXT: retq 94; 95; AVX-LABEL: shuffle_v4i32_3330: 96; AVX: # %bb.0: 97; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] 98; AVX-NEXT: retq 99 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> 100 ret <4 x i32> %shuffle 101} 102define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) { 103; SSE-LABEL: shuffle_v4i32_3210: 104; SSE: # %bb.0: 105; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] 106; SSE-NEXT: retq 107; 108; AVX-LABEL: shuffle_v4i32_3210: 109; AVX: # %bb.0: 110; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] 111; AVX-NEXT: retq 112 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 113 ret <4 x i32> %shuffle 114} 115 116define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) { 117; SSE-LABEL: shuffle_v4i32_2121: 118; SSE: # %bb.0: 119; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] 120; SSE-NEXT: retq 121; 122; AVX-LABEL: shuffle_v4i32_2121: 123; AVX: # %bb.0: 124; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1] 125; AVX-NEXT: retq 126 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1> 127 ret <4 x i32> %shuffle 128} 129 130define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) { 131; SSE-LABEL: shuffle_v4f32_0001: 132; SSE: # %bb.0: 133; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1] 134; SSE-NEXT: retq 135; 136; AVX-LABEL: shuffle_v4f32_0001: 137; AVX: # %bb.0: 138; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] 139; AVX-NEXT: retq 140 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> 141 ret <4 x float> %shuffle 142} 143define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) { 144; SSE-LABEL: shuffle_v4f32_0020: 145; SSE: # %bb.0: 146; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0] 147; SSE-NEXT: retq 148; 149; AVX-LABEL: shuffle_v4f32_0020: 150; AVX: # %bb.0: 151; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] 152; AVX-NEXT: retq 153 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> 154 ret <4 x float> %shuffle 155} 156define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) { 157; SSE-LABEL: shuffle_v4f32_0300: 158; SSE: # %bb.0: 159; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0] 160; SSE-NEXT: retq 161; 162; AVX-LABEL: shuffle_v4f32_0300: 163; AVX: # %bb.0: 164; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] 165; AVX-NEXT: retq 166 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> 167 ret <4 x float> %shuffle 168} 169define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) { 170; SSE-LABEL: shuffle_v4f32_1000: 171; SSE: # %bb.0: 172; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0] 173; SSE-NEXT: retq 174; 175; AVX-LABEL: shuffle_v4f32_1000: 176; AVX: # %bb.0: 177; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] 178; AVX-NEXT: retq 179 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 180 ret <4 x float> %shuffle 181} 182define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) { 183; SSE-LABEL: shuffle_v4f32_2200: 184; SSE: # %bb.0: 185; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0] 186; SSE-NEXT: retq 187; 188; AVX-LABEL: shuffle_v4f32_2200: 189; AVX: # %bb.0: 190; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] 191; AVX-NEXT: retq 192 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> 193 ret <4 x float> %shuffle 194} 195define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) { 196; SSE-LABEL: shuffle_v4f32_3330: 197; SSE: # %bb.0: 198; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0] 199; SSE-NEXT: retq 200; 201; AVX-LABEL: shuffle_v4f32_3330: 202; AVX: # %bb.0: 203; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] 204; AVX-NEXT: retq 205 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> 206 ret <4 x float> %shuffle 207} 208define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) { 209; SSE-LABEL: shuffle_v4f32_3210: 210; SSE: # %bb.0: 211; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] 212; SSE-NEXT: retq 213; 214; AVX-LABEL: shuffle_v4f32_3210: 215; AVX: # %bb.0: 216; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] 217; AVX-NEXT: retq 218 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 219 ret <4 x float> %shuffle 220} 221define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) { 222; SSE-LABEL: shuffle_v4f32_0011: 223; SSE: # %bb.0: 224; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] 225; SSE-NEXT: retq 226; 227; AVX-LABEL: shuffle_v4f32_0011: 228; AVX: # %bb.0: 229; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] 230; AVX-NEXT: retq 231 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1> 232 ret <4 x float> %shuffle 233} 234define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) { 235; SSE-LABEL: shuffle_v4f32_2233: 236; SSE: # %bb.0: 237; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] 238; SSE-NEXT: retq 239; 240; AVX-LABEL: shuffle_v4f32_2233: 241; AVX: # %bb.0: 242; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] 243; AVX-NEXT: retq 244 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3> 245 ret <4 x float> %shuffle 246} 247define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) { 248; SSE2-LABEL: shuffle_v4f32_0022: 249; SSE2: # %bb.0: 250; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2] 251; SSE2-NEXT: retq 252; 253; SSE3-LABEL: shuffle_v4f32_0022: 254; SSE3: # %bb.0: 255; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 256; SSE3-NEXT: retq 257; 258; SSSE3-LABEL: shuffle_v4f32_0022: 259; SSSE3: # %bb.0: 260; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 261; SSSE3-NEXT: retq 262; 263; SSE41-LABEL: shuffle_v4f32_0022: 264; SSE41: # %bb.0: 265; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 266; SSE41-NEXT: retq 267; 268; AVX-LABEL: shuffle_v4f32_0022: 269; AVX: # %bb.0: 270; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 271; AVX-NEXT: retq 272 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 273 ret <4 x float> %shuffle 274} 275define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { 276; SSE2-LABEL: shuffle_v4f32_1133: 277; SSE2: # %bb.0: 278; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] 279; SSE2-NEXT: retq 280; 281; SSE3-LABEL: shuffle_v4f32_1133: 282; SSE3: # %bb.0: 283; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 284; SSE3-NEXT: retq 285; 286; SSSE3-LABEL: shuffle_v4f32_1133: 287; SSSE3: # %bb.0: 288; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 289; SSSE3-NEXT: retq 290; 291; SSE41-LABEL: shuffle_v4f32_1133: 292; SSE41: # %bb.0: 293; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 294; SSE41-NEXT: retq 295; 296; AVX-LABEL: shuffle_v4f32_1133: 297; AVX: # %bb.0: 298; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 299; AVX-NEXT: retq 300 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 301 ret <4 x float> %shuffle 302} 303 304define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) { 305; SSE-LABEL: shuffle_v4f32_0145: 306; SSE: # %bb.0: 307; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 308; SSE-NEXT: retq 309; 310; AVX-LABEL: shuffle_v4f32_0145: 311; AVX: # %bb.0: 312; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 313; AVX-NEXT: retq 314 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 315 ret <4 x float> %shuffle 316} 317 318define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) { 319; SSE-LABEL: shuffle_v4f32_6723: 320; SSE: # %bb.0: 321; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 322; SSE-NEXT: retq 323; 324; AVX-LABEL: shuffle_v4f32_6723: 325; AVX: # %bb.0: 326; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 327; AVX-NEXT: retq 328 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 329 ret <4 x float> %shuffle 330} 331 332define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { 333; SSE2-LABEL: shuffle_v4i32_0124: 334; SSE2: # %bb.0: 335; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 336; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 337; SSE2-NEXT: retq 338; 339; SSE3-LABEL: shuffle_v4i32_0124: 340; SSE3: # %bb.0: 341; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 342; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 343; SSE3-NEXT: retq 344; 345; SSSE3-LABEL: shuffle_v4i32_0124: 346; SSSE3: # %bb.0: 347; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 348; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 349; SSSE3-NEXT: retq 350; 351; SSE41-LABEL: shuffle_v4i32_0124: 352; SSE41: # %bb.0: 353; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 354; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 355; SSE41-NEXT: retq 356; 357; AVX1-LABEL: shuffle_v4i32_0124: 358; AVX1: # %bb.0: 359; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] 360; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 361; AVX1-NEXT: retq 362; 363; AVX2-LABEL: shuffle_v4i32_0124: 364; AVX2: # %bb.0: 365; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 366; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 367; AVX2-NEXT: retq 368; 369; AVX512VL-LABEL: shuffle_v4i32_0124: 370; AVX512VL: # %bb.0: 371; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4] 372; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 373; AVX512VL-NEXT: retq 374 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 375 ret <4 x i32> %shuffle 376} 377define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { 378; SSE2-LABEL: shuffle_v4i32_0142: 379; SSE2: # %bb.0: 380; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 381; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 382; SSE2-NEXT: retq 383; 384; SSE3-LABEL: shuffle_v4i32_0142: 385; SSE3: # %bb.0: 386; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 387; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 388; SSE3-NEXT: retq 389; 390; SSSE3-LABEL: shuffle_v4i32_0142: 391; SSSE3: # %bb.0: 392; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 393; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 394; SSSE3-NEXT: retq 395; 396; SSE41-LABEL: shuffle_v4i32_0142: 397; SSE41: # %bb.0: 398; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 399; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 400; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] 401; SSE41-NEXT: retq 402; 403; AVX1-LABEL: shuffle_v4i32_0142: 404; AVX1: # %bb.0: 405; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] 406; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] 407; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 408; AVX1-NEXT: retq 409; 410; AVX2-LABEL: shuffle_v4i32_0142: 411; AVX2: # %bb.0: 412; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 413; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] 414; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 415; AVX2-NEXT: retq 416; 417; AVX512VL-LABEL: shuffle_v4i32_0142: 418; AVX512VL: # %bb.0: 419; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2] 420; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 421; AVX512VL-NEXT: retq 422 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 423 ret <4 x i32> %shuffle 424} 425define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { 426; SSE2-LABEL: shuffle_v4i32_0412: 427; SSE2: # %bb.0: 428; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 429; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] 430; SSE2-NEXT: movaps %xmm1, %xmm0 431; SSE2-NEXT: retq 432; 433; SSE3-LABEL: shuffle_v4i32_0412: 434; SSE3: # %bb.0: 435; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 436; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] 437; SSE3-NEXT: movaps %xmm1, %xmm0 438; SSE3-NEXT: retq 439; 440; SSSE3-LABEL: shuffle_v4i32_0412: 441; SSSE3: # %bb.0: 442; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 443; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] 444; SSSE3-NEXT: movaps %xmm1, %xmm0 445; SSSE3-NEXT: retq 446; 447; SSE41-LABEL: shuffle_v4i32_0412: 448; SSE41: # %bb.0: 449; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 450; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] 451; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 452; SSE41-NEXT: retq 453; 454; AVX1-LABEL: shuffle_v4i32_0412: 455; AVX1: # %bb.0: 456; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 457; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] 458; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 459; AVX1-NEXT: retq 460; 461; AVX2-LABEL: shuffle_v4i32_0412: 462; AVX2: # %bb.0: 463; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 464; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] 465; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 466; AVX2-NEXT: retq 467; 468; AVX512VL-LABEL: shuffle_v4i32_0412: 469; AVX512VL: # %bb.0: 470; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2] 471; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 472; AVX512VL-NEXT: retq 473 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2> 474 ret <4 x i32> %shuffle 475} 476define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { 477; SSE2-LABEL: shuffle_v4i32_4012: 478; SSE2: # %bb.0: 479; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 480; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 481; SSE2-NEXT: movaps %xmm1, %xmm0 482; SSE2-NEXT: retq 483; 484; SSE3-LABEL: shuffle_v4i32_4012: 485; SSE3: # %bb.0: 486; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 487; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 488; SSE3-NEXT: movaps %xmm1, %xmm0 489; SSE3-NEXT: retq 490; 491; SSSE3-LABEL: shuffle_v4i32_4012: 492; SSSE3: # %bb.0: 493; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 494; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 495; SSSE3-NEXT: movaps %xmm1, %xmm0 496; SSSE3-NEXT: retq 497; 498; SSE41-LABEL: shuffle_v4i32_4012: 499; SSE41: # %bb.0: 500; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2] 501; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 502; SSE41-NEXT: retq 503; 504; AVX1OR2-LABEL: shuffle_v4i32_4012: 505; AVX1OR2: # %bb.0: 506; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] 507; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 508; AVX1OR2-NEXT: retq 509; 510; AVX512VL-LABEL: shuffle_v4i32_4012: 511; AVX512VL: # %bb.0: 512; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2] 513; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 514; AVX512VL-NEXT: retq 515 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2> 516 ret <4 x i32> %shuffle 517} 518define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) { 519; SSE-LABEL: shuffle_v4i32_0145: 520; SSE: # %bb.0: 521; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 522; SSE-NEXT: retq 523; 524; AVX-LABEL: shuffle_v4i32_0145: 525; AVX: # %bb.0: 526; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 527; AVX-NEXT: retq 528 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 529 ret <4 x i32> %shuffle 530} 531define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { 532; SSE2-LABEL: shuffle_v4i32_0451: 533; SSE2: # %bb.0: 534; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 535; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] 536; SSE2-NEXT: retq 537; 538; SSE3-LABEL: shuffle_v4i32_0451: 539; SSE3: # %bb.0: 540; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 541; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] 542; SSE3-NEXT: retq 543; 544; SSSE3-LABEL: shuffle_v4i32_0451: 545; SSSE3: # %bb.0: 546; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 547; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] 548; SSSE3-NEXT: retq 549; 550; SSE41-LABEL: shuffle_v4i32_0451: 551; SSE41: # %bb.0: 552; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 553; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 554; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] 555; SSE41-NEXT: retq 556; 557; AVX1-LABEL: shuffle_v4i32_0451: 558; AVX1: # %bb.0: 559; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 560; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 561; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 562; AVX1-NEXT: retq 563; 564; AVX2-LABEL: shuffle_v4i32_0451: 565; AVX2: # %bb.0: 566; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 567; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 568; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 569; AVX2-NEXT: retq 570; 571; AVX512VL-LABEL: shuffle_v4i32_0451: 572; AVX512VL: # %bb.0: 573; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1] 574; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 575; AVX512VL-NEXT: retq 576 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1> 577 ret <4 x i32> %shuffle 578} 579define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) { 580; SSE-LABEL: shuffle_v4i32_4501: 581; SSE: # %bb.0: 582; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 583; SSE-NEXT: movaps %xmm1, %xmm0 584; SSE-NEXT: retq 585; 586; AVX-LABEL: shuffle_v4i32_4501: 587; AVX: # %bb.0: 588; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 589; AVX-NEXT: retq 590 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 591 ret <4 x i32> %shuffle 592} 593define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) { 594; SSE2-LABEL: shuffle_v4i32_4015: 595; SSE2: # %bb.0: 596; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 597; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] 598; SSE2-NEXT: retq 599; 600; SSE3-LABEL: shuffle_v4i32_4015: 601; SSE3: # %bb.0: 602; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 603; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] 604; SSE3-NEXT: retq 605; 606; SSSE3-LABEL: shuffle_v4i32_4015: 607; SSSE3: # %bb.0: 608; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 609; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] 610; SSSE3-NEXT: retq 611; 612; SSE41-LABEL: shuffle_v4i32_4015: 613; SSE41: # %bb.0: 614; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 615; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 616; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 617; SSE41-NEXT: retq 618; 619; AVX1-LABEL: shuffle_v4i32_4015: 620; AVX1: # %bb.0: 621; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] 622; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] 623; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 624; AVX1-NEXT: retq 625; 626; AVX2-LABEL: shuffle_v4i32_4015: 627; AVX2: # %bb.0: 628; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] 629; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] 630; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 631; AVX2-NEXT: retq 632; 633; AVX512VL-LABEL: shuffle_v4i32_4015: 634; AVX512VL: # %bb.0: 635; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5] 636; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 637; AVX512VL-NEXT: retq 638 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5> 639 ret <4 x i32> %shuffle 640} 641 642define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) { 643; SSE2-LABEL: shuffle_v4f32_4zzz: 644; SSE2: # %bb.0: 645; SSE2-NEXT: xorps %xmm1, %xmm1 646; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 647; SSE2-NEXT: movaps %xmm1, %xmm0 648; SSE2-NEXT: retq 649; 650; SSE3-LABEL: shuffle_v4f32_4zzz: 651; SSE3: # %bb.0: 652; SSE3-NEXT: xorps %xmm1, %xmm1 653; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 654; SSE3-NEXT: movaps %xmm1, %xmm0 655; SSE3-NEXT: retq 656; 657; SSSE3-LABEL: shuffle_v4f32_4zzz: 658; SSSE3: # %bb.0: 659; SSSE3-NEXT: xorps %xmm1, %xmm1 660; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 661; SSSE3-NEXT: movaps %xmm1, %xmm0 662; SSSE3-NEXT: retq 663; 664; SSE41-LABEL: shuffle_v4f32_4zzz: 665; SSE41: # %bb.0: 666; SSE41-NEXT: xorps %xmm1, %xmm1 667; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 668; SSE41-NEXT: retq 669; 670; AVX-LABEL: shuffle_v4f32_4zzz: 671; AVX: # %bb.0: 672; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 673; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 674; AVX-NEXT: retq 675 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 676 ret <4 x float> %shuffle 677} 678 679define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { 680; SSE2-LABEL: shuffle_v4f32_z4zz: 681; SSE2: # %bb.0: 682; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 683; SSE2-NEXT: xorps %xmm1, %xmm1 684; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 685; SSE2-NEXT: retq 686; 687; SSE3-LABEL: shuffle_v4f32_z4zz: 688; SSE3: # %bb.0: 689; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 690; SSE3-NEXT: xorps %xmm1, %xmm1 691; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 692; SSE3-NEXT: retq 693; 694; SSSE3-LABEL: shuffle_v4f32_z4zz: 695; SSSE3: # %bb.0: 696; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 697; SSSE3-NEXT: xorps %xmm1, %xmm1 698; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 699; SSSE3-NEXT: retq 700; 701; SSE41-LABEL: shuffle_v4f32_z4zz: 702; SSE41: # %bb.0: 703; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero 704; SSE41-NEXT: retq 705; 706; AVX-LABEL: shuffle_v4f32_z4zz: 707; AVX: # %bb.0: 708; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero 709; AVX-NEXT: retq 710 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> 711 ret <4 x float> %shuffle 712} 713 714define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { 715; SSE2-LABEL: shuffle_v4f32_zz4z: 716; SSE2: # %bb.0: 717; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero 718; SSE2-NEXT: pxor %xmm0, %xmm0 719; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 720; SSE2-NEXT: retq 721; 722; SSE3-LABEL: shuffle_v4f32_zz4z: 723; SSE3: # %bb.0: 724; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero 725; SSE3-NEXT: pxor %xmm0, %xmm0 726; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 727; SSE3-NEXT: retq 728; 729; SSSE3-LABEL: shuffle_v4f32_zz4z: 730; SSSE3: # %bb.0: 731; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero 732; SSSE3-NEXT: pxor %xmm0, %xmm0 733; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 734; SSSE3-NEXT: retq 735; 736; SSE41-LABEL: shuffle_v4f32_zz4z: 737; SSE41: # %bb.0: 738; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero 739; SSE41-NEXT: retq 740; 741; AVX-LABEL: shuffle_v4f32_zz4z: 742; AVX: # %bb.0: 743; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero 744; AVX-NEXT: retq 745 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> 746 ret <4 x float> %shuffle 747} 748 749define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) { 750; SSE2-LABEL: shuffle_v4f32_zuu4: 751; SSE2: # %bb.0: 752; SSE2-NEXT: xorps %xmm1, %xmm1 753; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 754; SSE2-NEXT: movaps %xmm1, %xmm0 755; SSE2-NEXT: retq 756; 757; SSE3-LABEL: shuffle_v4f32_zuu4: 758; SSE3: # %bb.0: 759; SSE3-NEXT: xorps %xmm1, %xmm1 760; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 761; SSE3-NEXT: movaps %xmm1, %xmm0 762; SSE3-NEXT: retq 763; 764; SSSE3-LABEL: shuffle_v4f32_zuu4: 765; SSSE3: # %bb.0: 766; SSSE3-NEXT: xorps %xmm1, %xmm1 767; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 768; SSSE3-NEXT: movaps %xmm1, %xmm0 769; SSSE3-NEXT: retq 770; 771; SSE41-LABEL: shuffle_v4f32_zuu4: 772; SSE41: # %bb.0: 773; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] 774; SSE41-NEXT: retq 775; 776; AVX-LABEL: shuffle_v4f32_zuu4: 777; AVX: # %bb.0: 778; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] 779; AVX-NEXT: retq 780 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> 781 ret <4 x float> %shuffle 782} 783 784define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) { 785; SSE2-LABEL: shuffle_v4f32_zzz7: 786; SSE2: # %bb.0: 787; SSE2-NEXT: xorps %xmm1, %xmm1 788; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 789; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 790; SSE2-NEXT: movaps %xmm1, %xmm0 791; SSE2-NEXT: retq 792; 793; SSE3-LABEL: shuffle_v4f32_zzz7: 794; SSE3: # %bb.0: 795; SSE3-NEXT: xorps %xmm1, %xmm1 796; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 797; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 798; SSE3-NEXT: movaps %xmm1, %xmm0 799; SSE3-NEXT: retq 800; 801; SSSE3-LABEL: shuffle_v4f32_zzz7: 802; SSSE3: # %bb.0: 803; SSSE3-NEXT: xorps %xmm1, %xmm1 804; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 805; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 806; SSSE3-NEXT: movaps %xmm1, %xmm0 807; SSSE3-NEXT: retq 808; 809; SSE41-LABEL: shuffle_v4f32_zzz7: 810; SSE41: # %bb.0: 811; SSE41-NEXT: xorps %xmm1, %xmm1 812; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 813; SSE41-NEXT: retq 814; 815; AVX-LABEL: shuffle_v4f32_zzz7: 816; AVX: # %bb.0: 817; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 818; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 819; AVX-NEXT: retq 820 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 821 ret <4 x float> %shuffle 822} 823 824define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { 825; SSE2-LABEL: shuffle_v4f32_z6zz: 826; SSE2: # %bb.0: 827; SSE2-NEXT: xorps %xmm1, %xmm1 828; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 829; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 830; SSE2-NEXT: retq 831; 832; SSE3-LABEL: shuffle_v4f32_z6zz: 833; SSE3: # %bb.0: 834; SSE3-NEXT: xorps %xmm1, %xmm1 835; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 836; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 837; SSE3-NEXT: retq 838; 839; SSSE3-LABEL: shuffle_v4f32_z6zz: 840; SSSE3: # %bb.0: 841; SSSE3-NEXT: xorps %xmm1, %xmm1 842; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 843; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 844; SSSE3-NEXT: retq 845; 846; SSE41-LABEL: shuffle_v4f32_z6zz: 847; SSE41: # %bb.0: 848; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero 849; SSE41-NEXT: retq 850; 851; AVX-LABEL: shuffle_v4f32_z6zz: 852; AVX: # %bb.0: 853; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero 854; AVX-NEXT: retq 855 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 856 ret <4 x float> %shuffle 857} 858 859define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) { 860; SSE2-LABEL: shuffle_v4f32_0z23: 861; SSE2: # %bb.0: 862; SSE2-NEXT: xorps %xmm1, %xmm1 863; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 864; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 865; SSE2-NEXT: movaps %xmm1, %xmm0 866; SSE2-NEXT: retq 867; 868; SSE3-LABEL: shuffle_v4f32_0z23: 869; SSE3: # %bb.0: 870; SSE3-NEXT: xorps %xmm1, %xmm1 871; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 872; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 873; SSE3-NEXT: movaps %xmm1, %xmm0 874; SSE3-NEXT: retq 875; 876; SSSE3-LABEL: shuffle_v4f32_0z23: 877; SSSE3: # %bb.0: 878; SSSE3-NEXT: xorps %xmm1, %xmm1 879; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 880; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 881; SSSE3-NEXT: movaps %xmm1, %xmm0 882; SSSE3-NEXT: retq 883; 884; SSE41-LABEL: shuffle_v4f32_0z23: 885; SSE41: # %bb.0: 886; SSE41-NEXT: xorps %xmm1, %xmm1 887; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 888; SSE41-NEXT: retq 889; 890; AVX-LABEL: shuffle_v4f32_0z23: 891; AVX: # %bb.0: 892; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 893; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 894; AVX-NEXT: retq 895 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3> 896 ret <4 x float> %shuffle 897} 898 899define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) { 900; SSE2-LABEL: shuffle_v4f32_01z3: 901; SSE2: # %bb.0: 902; SSE2-NEXT: xorps %xmm1, %xmm1 903; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 904; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 905; SSE2-NEXT: retq 906; 907; SSE3-LABEL: shuffle_v4f32_01z3: 908; SSE3: # %bb.0: 909; SSE3-NEXT: xorps %xmm1, %xmm1 910; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 911; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 912; SSE3-NEXT: retq 913; 914; SSSE3-LABEL: shuffle_v4f32_01z3: 915; SSSE3: # %bb.0: 916; SSSE3-NEXT: xorps %xmm1, %xmm1 917; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 918; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 919; SSSE3-NEXT: retq 920; 921; SSE41-LABEL: shuffle_v4f32_01z3: 922; SSE41: # %bb.0: 923; SSE41-NEXT: xorps %xmm1, %xmm1 924; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 925; SSE41-NEXT: retq 926; 927; AVX-LABEL: shuffle_v4f32_01z3: 928; AVX: # %bb.0: 929; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 930; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 931; AVX-NEXT: retq 932 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 933 ret <4 x float> %shuffle 934} 935 936define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) { 937; SSE2-LABEL: shuffle_v4f32_012z: 938; SSE2: # %bb.0: 939; SSE2-NEXT: xorps %xmm1, %xmm1 940; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] 941; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 942; SSE2-NEXT: retq 943; 944; SSE3-LABEL: shuffle_v4f32_012z: 945; SSE3: # %bb.0: 946; SSE3-NEXT: xorps %xmm1, %xmm1 947; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] 948; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 949; SSE3-NEXT: retq 950; 951; SSSE3-LABEL: shuffle_v4f32_012z: 952; SSSE3: # %bb.0: 953; SSSE3-NEXT: xorps %xmm1, %xmm1 954; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] 955; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 956; SSSE3-NEXT: retq 957; 958; SSE41-LABEL: shuffle_v4f32_012z: 959; SSE41: # %bb.0: 960; SSE41-NEXT: xorps %xmm1, %xmm1 961; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 962; SSE41-NEXT: retq 963; 964; AVX-LABEL: shuffle_v4f32_012z: 965; AVX: # %bb.0: 966; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 967; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 968; AVX-NEXT: retq 969 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 970 ret <4 x float> %shuffle 971} 972 973define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) { 974; SSE2-LABEL: shuffle_v4f32_0zz3: 975; SSE2: # %bb.0: 976; SSE2-NEXT: xorps %xmm1, %xmm1 977; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 978; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 979; SSE2-NEXT: retq 980; 981; SSE3-LABEL: shuffle_v4f32_0zz3: 982; SSE3: # %bb.0: 983; SSE3-NEXT: xorps %xmm1, %xmm1 984; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 985; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 986; SSE3-NEXT: retq 987; 988; SSSE3-LABEL: shuffle_v4f32_0zz3: 989; SSSE3: # %bb.0: 990; SSSE3-NEXT: xorps %xmm1, %xmm1 991; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 992; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 993; SSSE3-NEXT: retq 994; 995; SSE41-LABEL: shuffle_v4f32_0zz3: 996; SSE41: # %bb.0: 997; SSE41-NEXT: xorps %xmm1, %xmm1 998; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 999; SSE41-NEXT: retq 1000; 1001; AVX-LABEL: shuffle_v4f32_0zz3: 1002; AVX: # %bb.0: 1003; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1004; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1005; AVX-NEXT: retq 1006 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3> 1007 ret <4 x float> %shuffle 1008} 1009 1010define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) { 1011; SSE2-LABEL: shuffle_v4f32_0z2z: 1012; SSE2: # %bb.0: 1013; SSE2-NEXT: xorps %xmm1, %xmm1 1014; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] 1015; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1016; SSE2-NEXT: retq 1017; 1018; SSE3-LABEL: shuffle_v4f32_0z2z: 1019; SSE3: # %bb.0: 1020; SSE3-NEXT: xorps %xmm1, %xmm1 1021; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] 1022; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1023; SSE3-NEXT: retq 1024; 1025; SSSE3-LABEL: shuffle_v4f32_0z2z: 1026; SSSE3: # %bb.0: 1027; SSSE3-NEXT: xorps %xmm1, %xmm1 1028; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] 1029; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1030; SSSE3-NEXT: retq 1031; 1032; SSE41-LABEL: shuffle_v4f32_0z2z: 1033; SSE41: # %bb.0: 1034; SSE41-NEXT: xorps %xmm1, %xmm1 1035; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1036; SSE41-NEXT: retq 1037; 1038; AVX-LABEL: shuffle_v4f32_0z2z: 1039; AVX: # %bb.0: 1040; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1041; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1042; AVX-NEXT: retq 1043 %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4> 1044 ret <4 x float> %shuffle 1045} 1046 1047define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) { 1048; SSE-LABEL: shuffle_v4f32_u051: 1049; SSE: # %bb.0: 1050; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1051; SSE-NEXT: movaps %xmm1, %xmm0 1052; SSE-NEXT: retq 1053; 1054; AVX-LABEL: shuffle_v4f32_u051: 1055; AVX: # %bb.0: 1056; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1057; AVX-NEXT: retq 1058 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1> 1059 ret <4 x float> %shuffle 1060} 1061 1062define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) { 1063; SSE2-LABEL: shuffle_v4f32_0zz4: 1064; SSE2: # %bb.0: 1065; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero 1066; SSE2-NEXT: pxor %xmm1, %xmm1 1067; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 1068; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1069; SSE2-NEXT: movaps %xmm1, %xmm0 1070; SSE2-NEXT: retq 1071; 1072; SSE3-LABEL: shuffle_v4f32_0zz4: 1073; SSE3: # %bb.0: 1074; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero 1075; SSE3-NEXT: pxor %xmm1, %xmm1 1076; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 1077; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1078; SSE3-NEXT: movaps %xmm1, %xmm0 1079; SSE3-NEXT: retq 1080; 1081; SSSE3-LABEL: shuffle_v4f32_0zz4: 1082; SSSE3: # %bb.0: 1083; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero 1084; SSSE3-NEXT: pxor %xmm1, %xmm1 1085; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 1086; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1087; SSSE3-NEXT: movaps %xmm1, %xmm0 1088; SSSE3-NEXT: retq 1089; 1090; SSE41-LABEL: shuffle_v4f32_0zz4: 1091; SSE41: # %bb.0: 1092; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0] 1093; SSE41-NEXT: retq 1094; 1095; AVX-LABEL: shuffle_v4f32_0zz4: 1096; AVX: # %bb.0: 1097; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0] 1098; AVX-NEXT: retq 1099 %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0> 1100 %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1101 ret <4 x float> %shuffle1 1102} 1103 1104define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) { 1105; SSE2-LABEL: shuffle_v4f32_0zz6: 1106; SSE2: # %bb.0: 1107; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] 1108; SSE2-NEXT: xorps %xmm1, %xmm1 1109; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3] 1110; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1111; SSE2-NEXT: movaps %xmm1, %xmm0 1112; SSE2-NEXT: retq 1113; 1114; SSE3-LABEL: shuffle_v4f32_0zz6: 1115; SSE3: # %bb.0: 1116; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] 1117; SSE3-NEXT: xorps %xmm1, %xmm1 1118; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3] 1119; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1120; SSE3-NEXT: movaps %xmm1, %xmm0 1121; SSE3-NEXT: retq 1122; 1123; SSSE3-LABEL: shuffle_v4f32_0zz6: 1124; SSSE3: # %bb.0: 1125; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] 1126; SSSE3-NEXT: xorps %xmm1, %xmm1 1127; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3] 1128; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1129; SSSE3-NEXT: movaps %xmm1, %xmm0 1130; SSSE3-NEXT: retq 1131; 1132; SSE41-LABEL: shuffle_v4f32_0zz6: 1133; SSE41: # %bb.0: 1134; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2] 1135; SSE41-NEXT: retq 1136; 1137; AVX-LABEL: shuffle_v4f32_0zz6: 1138; AVX: # %bb.0: 1139; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2] 1140; AVX-NEXT: retq 1141 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6> 1142 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7> 1143 ret <4 x float> %shuffle1 1144} 1145 1146define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) { 1147; SSE2-LABEL: shuffle_v4f32_0z24: 1148; SSE2: # %bb.0: 1149; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 1150; SSE2-NEXT: xorps %xmm2, %xmm2 1151; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1152; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] 1153; SSE2-NEXT: movaps %xmm2, %xmm0 1154; SSE2-NEXT: retq 1155; 1156; SSE3-LABEL: shuffle_v4f32_0z24: 1157; SSE3: # %bb.0: 1158; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 1159; SSE3-NEXT: xorps %xmm2, %xmm2 1160; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1161; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] 1162; SSE3-NEXT: movaps %xmm2, %xmm0 1163; SSE3-NEXT: retq 1164; 1165; SSSE3-LABEL: shuffle_v4f32_0z24: 1166; SSSE3: # %bb.0: 1167; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 1168; SSSE3-NEXT: xorps %xmm2, %xmm2 1169; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1170; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] 1171; SSSE3-NEXT: movaps %xmm2, %xmm0 1172; SSSE3-NEXT: retq 1173; 1174; SSE41-LABEL: shuffle_v4f32_0z24: 1175; SSE41: # %bb.0: 1176; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0] 1177; SSE41-NEXT: retq 1178; 1179; AVX-LABEL: shuffle_v4f32_0z24: 1180; AVX: # %bb.0: 1181; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0] 1182; AVX-NEXT: retq 1183 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4> 1184 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1185 ret <4 x float> %shuffle1 1186} 1187 1188define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) { 1189; SSE2-LABEL: shuffle_v4i32_4zzz: 1190; SSE2: # %bb.0: 1191; SSE2-NEXT: xorps %xmm1, %xmm1 1192; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1193; SSE2-NEXT: movaps %xmm1, %xmm0 1194; SSE2-NEXT: retq 1195; 1196; SSE3-LABEL: shuffle_v4i32_4zzz: 1197; SSE3: # %bb.0: 1198; SSE3-NEXT: xorps %xmm1, %xmm1 1199; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1200; SSE3-NEXT: movaps %xmm1, %xmm0 1201; SSE3-NEXT: retq 1202; 1203; SSSE3-LABEL: shuffle_v4i32_4zzz: 1204; SSSE3: # %bb.0: 1205; SSSE3-NEXT: xorps %xmm1, %xmm1 1206; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1207; SSSE3-NEXT: movaps %xmm1, %xmm0 1208; SSSE3-NEXT: retq 1209; 1210; SSE41-LABEL: shuffle_v4i32_4zzz: 1211; SSE41: # %bb.0: 1212; SSE41-NEXT: xorps %xmm1, %xmm1 1213; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1214; SSE41-NEXT: retq 1215; 1216; AVX-LABEL: shuffle_v4i32_4zzz: 1217; AVX: # %bb.0: 1218; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1219; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1220; AVX-NEXT: retq 1221 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1222 ret <4 x i32> %shuffle 1223} 1224 1225define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) { 1226; SSE2-LABEL: shuffle_v4i32_z4zz: 1227; SSE2: # %bb.0: 1228; SSE2-NEXT: xorps %xmm1, %xmm1 1229; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1230; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1231; SSE2-NEXT: retq 1232; 1233; SSE3-LABEL: shuffle_v4i32_z4zz: 1234; SSE3: # %bb.0: 1235; SSE3-NEXT: xorps %xmm1, %xmm1 1236; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1237; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1238; SSE3-NEXT: retq 1239; 1240; SSSE3-LABEL: shuffle_v4i32_z4zz: 1241; SSSE3: # %bb.0: 1242; SSSE3-NEXT: xorps %xmm1, %xmm1 1243; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1244; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1245; SSSE3-NEXT: retq 1246; 1247; SSE41-LABEL: shuffle_v4i32_z4zz: 1248; SSE41: # %bb.0: 1249; SSE41-NEXT: pxor %xmm1, %xmm1 1250; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1251; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1252; SSE41-NEXT: retq 1253; 1254; AVX1-LABEL: shuffle_v4i32_z4zz: 1255; AVX1: # %bb.0: 1256; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1257; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1258; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] 1259; AVX1-NEXT: retq 1260; 1261; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz: 1262; AVX2-SLOW: # %bb.0: 1263; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 1264; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1265; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] 1266; AVX2-SLOW-NEXT: retq 1267; 1268; AVX2-FAST-LABEL: shuffle_v4i32_z4zz: 1269; AVX2-FAST: # %bb.0: 1270; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 1271; AVX2-FAST-NEXT: retq 1272; 1273; AVX512VL-LABEL: shuffle_v4i32_z4zz: 1274; AVX512VL: # %bb.0: 1275; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 1276; AVX512VL-NEXT: retq 1277 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> 1278 ret <4 x i32> %shuffle 1279} 1280 1281define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) { 1282; SSE2-LABEL: shuffle_v4i32_zz4z: 1283; SSE2: # %bb.0: 1284; SSE2-NEXT: xorps %xmm1, %xmm1 1285; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1286; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1287; SSE2-NEXT: retq 1288; 1289; SSE3-LABEL: shuffle_v4i32_zz4z: 1290; SSE3: # %bb.0: 1291; SSE3-NEXT: xorps %xmm1, %xmm1 1292; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1293; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1294; SSE3-NEXT: retq 1295; 1296; SSSE3-LABEL: shuffle_v4i32_zz4z: 1297; SSSE3: # %bb.0: 1298; SSSE3-NEXT: xorps %xmm1, %xmm1 1299; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1300; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1301; SSSE3-NEXT: retq 1302; 1303; SSE41-LABEL: shuffle_v4i32_zz4z: 1304; SSE41: # %bb.0: 1305; SSE41-NEXT: pxor %xmm1, %xmm1 1306; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1307; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1308; SSE41-NEXT: retq 1309; 1310; AVX1-LABEL: shuffle_v4i32_zz4z: 1311; AVX1: # %bb.0: 1312; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1313; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1314; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] 1315; AVX1-NEXT: retq 1316; 1317; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z: 1318; AVX2-SLOW: # %bb.0: 1319; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 1320; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1321; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] 1322; AVX2-SLOW-NEXT: retq 1323; 1324; AVX2-FAST-LABEL: shuffle_v4i32_zz4z: 1325; AVX2-FAST: # %bb.0: 1326; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 1327; AVX2-FAST-NEXT: retq 1328; 1329; AVX512VL-LABEL: shuffle_v4i32_zz4z: 1330; AVX512VL: # %bb.0: 1331; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 1332; AVX512VL-NEXT: retq 1333 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> 1334 ret <4 x i32> %shuffle 1335} 1336 1337define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) { 1338; SSE-LABEL: shuffle_v4i32_zuu4: 1339; SSE: # %bb.0: 1340; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] 1341; SSE-NEXT: retq 1342; 1343; AVX-LABEL: shuffle_v4i32_zuu4: 1344; AVX: # %bb.0: 1345; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] 1346; AVX-NEXT: retq 1347 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> 1348 ret <4 x i32> %shuffle 1349} 1350 1351define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { 1352; SSE2-LABEL: shuffle_v4i32_z6zz: 1353; SSE2: # %bb.0: 1354; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 1355; SSE2-NEXT: xorps %xmm1, %xmm1 1356; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1357; SSE2-NEXT: retq 1358; 1359; SSE3-LABEL: shuffle_v4i32_z6zz: 1360; SSE3: # %bb.0: 1361; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 1362; SSE3-NEXT: xorps %xmm1, %xmm1 1363; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1364; SSE3-NEXT: retq 1365; 1366; SSSE3-LABEL: shuffle_v4i32_z6zz: 1367; SSSE3: # %bb.0: 1368; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 1369; SSSE3-NEXT: xorps %xmm1, %xmm1 1370; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1371; SSSE3-NEXT: retq 1372; 1373; SSE41-LABEL: shuffle_v4i32_z6zz: 1374; SSE41: # %bb.0: 1375; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] 1376; SSE41-NEXT: pxor %xmm0, %xmm0 1377; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1378; SSE41-NEXT: retq 1379; 1380; AVX1-LABEL: shuffle_v4i32_z6zz: 1381; AVX1: # %bb.0: 1382; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] 1383; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1384; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1385; AVX1-NEXT: retq 1386; 1387; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz: 1388; AVX2-SLOW: # %bb.0: 1389; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] 1390; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 1391; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1392; AVX2-SLOW-NEXT: retq 1393; 1394; AVX2-FAST-LABEL: shuffle_v4i32_z6zz: 1395; AVX2-FAST: # %bb.0: 1396; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero 1397; AVX2-FAST-NEXT: retq 1398; 1399; AVX512VL-LABEL: shuffle_v4i32_z6zz: 1400; AVX512VL: # %bb.0: 1401; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero 1402; AVX512VL-NEXT: retq 1403 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 1404 ret <4 x i32> %shuffle 1405} 1406 1407define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) { 1408; SSE2-LABEL: shuffle_v4i32_7012: 1409; SSE2: # %bb.0: 1410; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] 1411; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 1412; SSE2-NEXT: movaps %xmm1, %xmm0 1413; SSE2-NEXT: retq 1414; 1415; SSE3-LABEL: shuffle_v4i32_7012: 1416; SSE3: # %bb.0: 1417; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] 1418; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 1419; SSE3-NEXT: movaps %xmm1, %xmm0 1420; SSE3-NEXT: retq 1421; 1422; SSSE3-LABEL: shuffle_v4i32_7012: 1423; SSSE3: # %bb.0: 1424; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] 1425; SSSE3-NEXT: retq 1426; 1427; SSE41-LABEL: shuffle_v4i32_7012: 1428; SSE41: # %bb.0: 1429; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] 1430; SSE41-NEXT: retq 1431; 1432; AVX-LABEL: shuffle_v4i32_7012: 1433; AVX: # %bb.0: 1434; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] 1435; AVX-NEXT: retq 1436 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2> 1437 ret <4 x i32> %shuffle 1438} 1439 1440define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) { 1441; SSE2-LABEL: shuffle_v4i32_6701: 1442; SSE2: # %bb.0: 1443; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] 1444; SSE2-NEXT: movaps %xmm1, %xmm0 1445; SSE2-NEXT: retq 1446; 1447; SSE3-LABEL: shuffle_v4i32_6701: 1448; SSE3: # %bb.0: 1449; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] 1450; SSE3-NEXT: movaps %xmm1, %xmm0 1451; SSE3-NEXT: retq 1452; 1453; SSSE3-LABEL: shuffle_v4i32_6701: 1454; SSSE3: # %bb.0: 1455; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] 1456; SSSE3-NEXT: retq 1457; 1458; SSE41-LABEL: shuffle_v4i32_6701: 1459; SSE41: # %bb.0: 1460; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] 1461; SSE41-NEXT: retq 1462; 1463; AVX-LABEL: shuffle_v4i32_6701: 1464; AVX: # %bb.0: 1465; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] 1466; AVX-NEXT: retq 1467 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1468 ret <4 x i32> %shuffle 1469} 1470 1471define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) { 1472; SSE2-LABEL: shuffle_v4i32_5670: 1473; SSE2: # %bb.0: 1474; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1475; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] 1476; SSE2-NEXT: movaps %xmm1, %xmm0 1477; SSE2-NEXT: retq 1478; 1479; SSE3-LABEL: shuffle_v4i32_5670: 1480; SSE3: # %bb.0: 1481; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1482; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] 1483; SSE3-NEXT: movaps %xmm1, %xmm0 1484; SSE3-NEXT: retq 1485; 1486; SSSE3-LABEL: shuffle_v4i32_5670: 1487; SSSE3: # %bb.0: 1488; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] 1489; SSSE3-NEXT: retq 1490; 1491; SSE41-LABEL: shuffle_v4i32_5670: 1492; SSE41: # %bb.0: 1493; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] 1494; SSE41-NEXT: retq 1495; 1496; AVX-LABEL: shuffle_v4i32_5670: 1497; AVX: # %bb.0: 1498; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] 1499; AVX-NEXT: retq 1500 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0> 1501 ret <4 x i32> %shuffle 1502} 1503 1504define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) { 1505; SSE2-LABEL: shuffle_v4i32_1234: 1506; SSE2: # %bb.0: 1507; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 1508; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] 1509; SSE2-NEXT: retq 1510; 1511; SSE3-LABEL: shuffle_v4i32_1234: 1512; SSE3: # %bb.0: 1513; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 1514; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] 1515; SSE3-NEXT: retq 1516; 1517; SSSE3-LABEL: shuffle_v4i32_1234: 1518; SSSE3: # %bb.0: 1519; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] 1520; SSSE3-NEXT: movdqa %xmm1, %xmm0 1521; SSSE3-NEXT: retq 1522; 1523; SSE41-LABEL: shuffle_v4i32_1234: 1524; SSE41: # %bb.0: 1525; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] 1526; SSE41-NEXT: movdqa %xmm1, %xmm0 1527; SSE41-NEXT: retq 1528; 1529; AVX-LABEL: shuffle_v4i32_1234: 1530; AVX: # %bb.0: 1531; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] 1532; AVX-NEXT: retq 1533 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 1534 ret <4 x i32> %shuffle 1535} 1536 1537define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) { 1538; SSE2-LABEL: shuffle_v4i32_2345: 1539; SSE2: # %bb.0: 1540; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1541; SSE2-NEXT: retq 1542; 1543; SSE3-LABEL: shuffle_v4i32_2345: 1544; SSE3: # %bb.0: 1545; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1546; SSE3-NEXT: retq 1547; 1548; SSSE3-LABEL: shuffle_v4i32_2345: 1549; SSSE3: # %bb.0: 1550; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] 1551; SSSE3-NEXT: movdqa %xmm1, %xmm0 1552; SSSE3-NEXT: retq 1553; 1554; SSE41-LABEL: shuffle_v4i32_2345: 1555; SSE41: # %bb.0: 1556; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] 1557; SSE41-NEXT: movdqa %xmm1, %xmm0 1558; SSE41-NEXT: retq 1559; 1560; AVX-LABEL: shuffle_v4i32_2345: 1561; AVX: # %bb.0: 1562; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] 1563; AVX-NEXT: retq 1564 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 1565 ret <4 x i32> %shuffle 1566} 1567 1568; PR22391 1569define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { 1570; SSE2-LABEL: shuffle_v4i32_2456: 1571; SSE2: # %bb.0: 1572; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1573; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1574; SSE2-NEXT: retq 1575; 1576; SSE3-LABEL: shuffle_v4i32_2456: 1577; SSE3: # %bb.0: 1578; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1579; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1580; SSE3-NEXT: retq 1581; 1582; SSSE3-LABEL: shuffle_v4i32_2456: 1583; SSSE3: # %bb.0: 1584; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1585; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1586; SSSE3-NEXT: movdqa %xmm1, %xmm0 1587; SSSE3-NEXT: retq 1588; 1589; SSE41-LABEL: shuffle_v4i32_2456: 1590; SSE41: # %bb.0: 1591; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1592; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1593; SSE41-NEXT: movdqa %xmm1, %xmm0 1594; SSE41-NEXT: retq 1595; 1596; AVX1OR2-LABEL: shuffle_v4i32_2456: 1597; AVX1OR2: # %bb.0: 1598; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1599; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1600; AVX1OR2-NEXT: retq 1601; 1602; AVX512VL-LABEL: shuffle_v4i32_2456: 1603; AVX512VL: # %bb.0: 1604; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,0,1,2] 1605; AVX512VL-NEXT: vpermi2d %xmm0, %xmm1, %xmm2 1606; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 1607; AVX512VL-NEXT: retq 1608 %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 1609 %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 1610 ret <4 x i32> %s2 1611} 1612 1613define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) { 1614; SSE-LABEL: shuffle_v4i32_40u1: 1615; SSE: # %bb.0: 1616; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1617; SSE-NEXT: movaps %xmm1, %xmm0 1618; SSE-NEXT: retq 1619; 1620; AVX-LABEL: shuffle_v4i32_40u1: 1621; AVX: # %bb.0: 1622; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1623; AVX-NEXT: retq 1624 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1> 1625 ret <4 x i32> %shuffle 1626} 1627 1628define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) { 1629; SSE2-LABEL: shuffle_v4i32_3456: 1630; SSE2: # %bb.0: 1631; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] 1632; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1633; SSE2-NEXT: retq 1634; 1635; SSE3-LABEL: shuffle_v4i32_3456: 1636; SSE3: # %bb.0: 1637; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] 1638; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1639; SSE3-NEXT: retq 1640; 1641; SSSE3-LABEL: shuffle_v4i32_3456: 1642; SSSE3: # %bb.0: 1643; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1644; SSSE3-NEXT: movdqa %xmm1, %xmm0 1645; SSSE3-NEXT: retq 1646; 1647; SSE41-LABEL: shuffle_v4i32_3456: 1648; SSE41: # %bb.0: 1649; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1650; SSE41-NEXT: movdqa %xmm1, %xmm0 1651; SSE41-NEXT: retq 1652; 1653; AVX-LABEL: shuffle_v4i32_3456: 1654; AVX: # %bb.0: 1655; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1656; AVX-NEXT: retq 1657 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 1658 ret <4 x i32> %shuffle 1659} 1660 1661define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) { 1662; SSE2-LABEL: shuffle_v4i32_0u1u: 1663; SSE2: # %bb.0: 1664; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1665; SSE2-NEXT: retq 1666; 1667; SSE3-LABEL: shuffle_v4i32_0u1u: 1668; SSE3: # %bb.0: 1669; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1670; SSE3-NEXT: retq 1671; 1672; SSSE3-LABEL: shuffle_v4i32_0u1u: 1673; SSSE3: # %bb.0: 1674; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1675; SSSE3-NEXT: retq 1676; 1677; SSE41-LABEL: shuffle_v4i32_0u1u: 1678; SSE41: # %bb.0: 1679; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1680; SSE41-NEXT: retq 1681; 1682; AVX-LABEL: shuffle_v4i32_0u1u: 1683; AVX: # %bb.0: 1684; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1685; AVX-NEXT: retq 1686 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef> 1687 ret <4 x i32> %shuffle 1688} 1689 1690define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) { 1691; SSE2-LABEL: shuffle_v4i32_0z1z: 1692; SSE2: # %bb.0: 1693; SSE2-NEXT: xorps %xmm1, %xmm1 1694; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1695; SSE2-NEXT: retq 1696; 1697; SSE3-LABEL: shuffle_v4i32_0z1z: 1698; SSE3: # %bb.0: 1699; SSE3-NEXT: xorps %xmm1, %xmm1 1700; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1701; SSE3-NEXT: retq 1702; 1703; SSSE3-LABEL: shuffle_v4i32_0z1z: 1704; SSSE3: # %bb.0: 1705; SSSE3-NEXT: xorps %xmm1, %xmm1 1706; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1707; SSSE3-NEXT: retq 1708; 1709; SSE41-LABEL: shuffle_v4i32_0z1z: 1710; SSE41: # %bb.0: 1711; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1712; SSE41-NEXT: retq 1713; 1714; AVX-LABEL: shuffle_v4i32_0z1z: 1715; AVX: # %bb.0: 1716; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1717; AVX-NEXT: retq 1718 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1719 ret <4 x i32> %shuffle 1720} 1721 1722define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) { 1723; SSE-LABEL: shuffle_v4i32_01zu: 1724; SSE: # %bb.0: 1725; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 1726; SSE-NEXT: retq 1727; 1728; AVX-LABEL: shuffle_v4i32_01zu: 1729; AVX: # %bb.0: 1730; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1731; AVX-NEXT: retq 1732 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef> 1733 ret <4 x i32> %shuffle 1734} 1735 1736define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) { 1737; SSE2-LABEL: shuffle_v4i32_0z23: 1738; SSE2: # %bb.0: 1739; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 1740; SSE2-NEXT: retq 1741; 1742; SSE3-LABEL: shuffle_v4i32_0z23: 1743; SSE3: # %bb.0: 1744; SSE3-NEXT: andps {{.*}}(%rip), %xmm0 1745; SSE3-NEXT: retq 1746; 1747; SSSE3-LABEL: shuffle_v4i32_0z23: 1748; SSSE3: # %bb.0: 1749; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 1750; SSSE3-NEXT: retq 1751; 1752; SSE41-LABEL: shuffle_v4i32_0z23: 1753; SSE41: # %bb.0: 1754; SSE41-NEXT: xorps %xmm1, %xmm1 1755; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1756; SSE41-NEXT: retq 1757; 1758; AVX-LABEL: shuffle_v4i32_0z23: 1759; AVX: # %bb.0: 1760; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1761; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1762; AVX-NEXT: retq 1763 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3> 1764 ret <4 x i32> %shuffle 1765} 1766 1767define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) { 1768; SSE2-LABEL: shuffle_v4i32_01z3: 1769; SSE2: # %bb.0: 1770; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 1771; SSE2-NEXT: retq 1772; 1773; SSE3-LABEL: shuffle_v4i32_01z3: 1774; SSE3: # %bb.0: 1775; SSE3-NEXT: andps {{.*}}(%rip), %xmm0 1776; SSE3-NEXT: retq 1777; 1778; SSSE3-LABEL: shuffle_v4i32_01z3: 1779; SSSE3: # %bb.0: 1780; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 1781; SSSE3-NEXT: retq 1782; 1783; SSE41-LABEL: shuffle_v4i32_01z3: 1784; SSE41: # %bb.0: 1785; SSE41-NEXT: xorps %xmm1, %xmm1 1786; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 1787; SSE41-NEXT: retq 1788; 1789; AVX-LABEL: shuffle_v4i32_01z3: 1790; AVX: # %bb.0: 1791; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1792; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 1793; AVX-NEXT: retq 1794 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 1795 ret <4 x i32> %shuffle 1796} 1797 1798define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) { 1799; SSE2-LABEL: shuffle_v4i32_012z: 1800; SSE2: # %bb.0: 1801; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 1802; SSE2-NEXT: retq 1803; 1804; SSE3-LABEL: shuffle_v4i32_012z: 1805; SSE3: # %bb.0: 1806; SSE3-NEXT: andps {{.*}}(%rip), %xmm0 1807; SSE3-NEXT: retq 1808; 1809; SSSE3-LABEL: shuffle_v4i32_012z: 1810; SSSE3: # %bb.0: 1811; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 1812; SSSE3-NEXT: retq 1813; 1814; SSE41-LABEL: shuffle_v4i32_012z: 1815; SSE41: # %bb.0: 1816; SSE41-NEXT: xorps %xmm1, %xmm1 1817; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 1818; SSE41-NEXT: retq 1819; 1820; AVX-LABEL: shuffle_v4i32_012z: 1821; AVX: # %bb.0: 1822; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1823; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 1824; AVX-NEXT: retq 1825 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1826 ret <4 x i32> %shuffle 1827} 1828 1829define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) { 1830; SSE2-LABEL: shuffle_v4i32_0zz3: 1831; SSE2: # %bb.0: 1832; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 1833; SSE2-NEXT: retq 1834; 1835; SSE3-LABEL: shuffle_v4i32_0zz3: 1836; SSE3: # %bb.0: 1837; SSE3-NEXT: andps {{.*}}(%rip), %xmm0 1838; SSE3-NEXT: retq 1839; 1840; SSSE3-LABEL: shuffle_v4i32_0zz3: 1841; SSSE3: # %bb.0: 1842; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 1843; SSSE3-NEXT: retq 1844; 1845; SSE41-LABEL: shuffle_v4i32_0zz3: 1846; SSE41: # %bb.0: 1847; SSE41-NEXT: xorps %xmm1, %xmm1 1848; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1849; SSE41-NEXT: retq 1850; 1851; AVX-LABEL: shuffle_v4i32_0zz3: 1852; AVX: # %bb.0: 1853; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1854; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1855; AVX-NEXT: retq 1856 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3> 1857 ret <4 x i32> %shuffle 1858} 1859 1860define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) { 1861; SSE-LABEL: shuffle_v4i32_bitcast_0415: 1862; SSE: # %bb.0: 1863; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1864; SSE-NEXT: retq 1865; 1866; AVX-LABEL: shuffle_v4i32_bitcast_0415: 1867; AVX: # %bb.0: 1868; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1869; AVX-NEXT: retq 1870 %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4> 1871 %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double> 1872 %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0> 1873 %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32> 1874 ret <4 x i32> %bitcast32 1875} 1876 1877define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) { 1878; SSE-LABEL: shuffle_v4f32_bitcast_4401: 1879; SSE: # %bb.0: 1880; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1] 1881; SSE-NEXT: movaps %xmm1, %xmm0 1882; SSE-NEXT: retq 1883; 1884; AVX-LABEL: shuffle_v4f32_bitcast_4401: 1885; AVX: # %bb.0: 1886; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1] 1887; AVX-NEXT: retq 1888 %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1> 1889 %2 = bitcast <4 x i32> %1 to <2 x double> 1890 %3 = bitcast <4 x float> %a to <2 x double> 1891 %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2> 1892 %5 = bitcast <2 x double> %4 to <4 x float> 1893 ret <4 x float> %5 1894} 1895 1896define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) { 1897; SSE-LABEL: shuffle_v4f32_bitcast_0045: 1898; SSE: # %bb.0: 1899; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1] 1900; SSE-NEXT: retq 1901; 1902; AVX-LABEL: shuffle_v4f32_bitcast_0045: 1903; AVX: # %bb.0: 1904; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1] 1905; AVX-NEXT: retq 1906 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1> 1907 %2 = bitcast <4 x i32> %b to <4 x float> 1908 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5> 1909 ret <4 x float> %3 1910} 1911 1912define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) { 1913; SSE2-LABEL: mask_v4f32_4127: 1914; SSE2: # %bb.0: 1915; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] 1916; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1917; SSE2-NEXT: movaps %xmm1, %xmm0 1918; SSE2-NEXT: retq 1919; 1920; SSE3-LABEL: mask_v4f32_4127: 1921; SSE3: # %bb.0: 1922; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] 1923; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1924; SSE3-NEXT: movaps %xmm1, %xmm0 1925; SSE3-NEXT: retq 1926; 1927; SSSE3-LABEL: mask_v4f32_4127: 1928; SSSE3: # %bb.0: 1929; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] 1930; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1931; SSSE3-NEXT: movaps %xmm1, %xmm0 1932; SSSE3-NEXT: retq 1933; 1934; SSE41-LABEL: mask_v4f32_4127: 1935; SSE41: # %bb.0: 1936; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 1937; SSE41-NEXT: retq 1938; 1939; AVX-LABEL: mask_v4f32_4127: 1940; AVX: # %bb.0: 1941; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 1942; AVX-NEXT: retq 1943 %1 = bitcast <4 x float> %a to <4 x i32> 1944 %2 = bitcast <4 x float> %b to <4 x i32> 1945 %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0> 1946 %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1> 1947 %5 = or <4 x i32> %4, %3 1948 %6 = bitcast <4 x i32> %5 to <4 x float> 1949 ret <4 x float> %6 1950} 1951 1952define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) { 1953; SSE2-LABEL: mask_v4f32_0127: 1954; SSE2: # %bb.0: 1955; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 1956; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 1957; SSE2-NEXT: movaps %xmm1, %xmm0 1958; SSE2-NEXT: retq 1959; 1960; SSE3-LABEL: mask_v4f32_0127: 1961; SSE3: # %bb.0: 1962; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 1963; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 1964; SSE3-NEXT: movaps %xmm1, %xmm0 1965; SSE3-NEXT: retq 1966; 1967; SSSE3-LABEL: mask_v4f32_0127: 1968; SSSE3: # %bb.0: 1969; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 1970; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 1971; SSSE3-NEXT: movaps %xmm1, %xmm0 1972; SSSE3-NEXT: retq 1973; 1974; SSE41-LABEL: mask_v4f32_0127: 1975; SSE41: # %bb.0: 1976; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 1977; SSE41-NEXT: retq 1978; 1979; AVX-LABEL: mask_v4f32_0127: 1980; AVX: # %bb.0: 1981; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 1982; AVX-NEXT: retq 1983 %1 = bitcast <4 x float> %a to <2 x i64> 1984 %2 = bitcast <4 x float> %b to <2 x i64> 1985 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296> 1986 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295> 1987 %5 = or <2 x i64> %4, %3 1988 %6 = bitcast <2 x i64> %5 to <4 x float> 1989 ret <4 x float> %6 1990} 1991 1992define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) { 1993; SSE2-LABEL: mask_v4i32_0127: 1994; SSE2: # %bb.0: 1995; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 1996; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 1997; SSE2-NEXT: movaps %xmm1, %xmm0 1998; SSE2-NEXT: retq 1999; 2000; SSE3-LABEL: mask_v4i32_0127: 2001; SSE3: # %bb.0: 2002; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 2003; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 2004; SSE3-NEXT: movaps %xmm1, %xmm0 2005; SSE3-NEXT: retq 2006; 2007; SSSE3-LABEL: mask_v4i32_0127: 2008; SSSE3: # %bb.0: 2009; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 2010; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 2011; SSSE3-NEXT: movaps %xmm1, %xmm0 2012; SSSE3-NEXT: retq 2013; 2014; SSE41-LABEL: mask_v4i32_0127: 2015; SSE41: # %bb.0: 2016; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 2017; SSE41-NEXT: retq 2018; 2019; AVX-LABEL: mask_v4i32_0127: 2020; AVX: # %bb.0: 2021; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 2022; AVX-NEXT: retq 2023 %1 = bitcast <4 x i32> %a to <2 x i64> 2024 %2 = bitcast <4 x i32> %b to <2 x i64> 2025 %3 = and <2 x i64> %1, <i64 0, i64 -4294967296> 2026 %4 = and <2 x i64> %2, <i64 -1, i64 4294967295> 2027 %5 = or <2 x i64> %4, %3 2028 %6 = bitcast <2 x i64> %5 to <4 x i32> 2029 ret <4 x i32> %6 2030} 2031 2032define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) { 2033; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32: 2034; SSE2: # %bb.0: 2035; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2036; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2037; SSE2-NEXT: retq 2038; 2039; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32: 2040; SSE3: # %bb.0: 2041; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] 2042; SSE3-NEXT: retq 2043; 2044; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32: 2045; SSSE3: # %bb.0: 2046; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] 2047; SSSE3-NEXT: retq 2048; 2049; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32: 2050; SSE41: # %bb.0: 2051; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] 2052; SSE41-NEXT: retq 2053; 2054; AVX-LABEL: broadcast_v4f32_0101_from_v2f32: 2055; AVX: # %bb.0: 2056; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] 2057; AVX-NEXT: retq 2058 %1 = load <2 x float>, <2 x float>* %x, align 1 2059 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 2060 ret <4 x float> %2 2061} 2062 2063define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) { 2064; SSE2-LABEL: extract3_insert0_v4i32_7123: 2065; SSE2: # %bb.0: 2066; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 2067; SSE2-NEXT: movd %xmm1, %eax 2068; SSE2-NEXT: movd %eax, %xmm1 2069; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2070; SSE2-NEXT: retq 2071; 2072; SSE3-LABEL: extract3_insert0_v4i32_7123: 2073; SSE3: # %bb.0: 2074; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 2075; SSE3-NEXT: movd %xmm1, %eax 2076; SSE3-NEXT: movd %eax, %xmm1 2077; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2078; SSE3-NEXT: retq 2079; 2080; SSSE3-LABEL: extract3_insert0_v4i32_7123: 2081; SSSE3: # %bb.0: 2082; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 2083; SSSE3-NEXT: movd %xmm1, %eax 2084; SSSE3-NEXT: movd %eax, %xmm1 2085; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2086; SSSE3-NEXT: retq 2087; 2088; SSE41-LABEL: extract3_insert0_v4i32_7123: 2089; SSE41: # %bb.0: 2090; SSE41-NEXT: extractps $3, %xmm1, %eax 2091; SSE41-NEXT: pinsrd $0, %eax, %xmm0 2092; SSE41-NEXT: retq 2093; 2094; AVX-LABEL: extract3_insert0_v4i32_7123: 2095; AVX: # %bb.0: 2096; AVX-NEXT: vextractps $3, %xmm1, %eax 2097; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 2098; AVX-NEXT: retq 2099 %1 = extractelement <4 x i32> %a1, i32 3 2100 %2 = insertelement <4 x i32> %a0, i32 %1, i32 0 2101 ret <4 x i32> %2 2102} 2103 2104define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) { 2105; SSE2-LABEL: extract3_insert3_v4i32_0127: 2106; SSE2: # %bb.0: 2107; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2108; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2109; SSE2-NEXT: retq 2110; 2111; SSE3-LABEL: extract3_insert3_v4i32_0127: 2112; SSE3: # %bb.0: 2113; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2114; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2115; SSE3-NEXT: retq 2116; 2117; SSSE3-LABEL: extract3_insert3_v4i32_0127: 2118; SSSE3: # %bb.0: 2119; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2120; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2121; SSSE3-NEXT: retq 2122; 2123; SSE41-LABEL: extract3_insert3_v4i32_0127: 2124; SSE41: # %bb.0: 2125; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 2126; SSE41-NEXT: retq 2127; 2128; AVX-LABEL: extract3_insert3_v4i32_0127: 2129; AVX: # %bb.0: 2130; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 2131; AVX-NEXT: retq 2132 %1 = extractelement <4 x i32> %a1, i32 3 2133 %2 = insertelement <4 x i32> %a0, i32 %1, i32 3 2134 ret <4 x i32> %2 2135} 2136 2137define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) { 2138; SSE-LABEL: insert_reg_and_zero_v4i32: 2139; SSE: # %bb.0: 2140; SSE-NEXT: movd %edi, %xmm0 2141; SSE-NEXT: retq 2142; 2143; AVX-LABEL: insert_reg_and_zero_v4i32: 2144; AVX: # %bb.0: 2145; AVX-NEXT: vmovd %edi, %xmm0 2146; AVX-NEXT: retq 2147 %v = insertelement <4 x i32> undef, i32 %a, i32 0 2148 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2149 ret <4 x i32> %shuffle 2150} 2151 2152define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) { 2153; SSE-LABEL: insert_mem_and_zero_v4i32: 2154; SSE: # %bb.0: 2155; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2156; SSE-NEXT: retq 2157; 2158; AVX-LABEL: insert_mem_and_zero_v4i32: 2159; AVX: # %bb.0: 2160; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2161; AVX-NEXT: retq 2162 %a = load i32, i32* %ptr 2163 %v = insertelement <4 x i32> undef, i32 %a, i32 0 2164 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2165 ret <4 x i32> %shuffle 2166} 2167 2168define <4 x float> @insert_reg_and_zero_v4f32(float %a) { 2169; SSE2-LABEL: insert_reg_and_zero_v4f32: 2170; SSE2: # %bb.0: 2171; SSE2-NEXT: xorps %xmm1, %xmm1 2172; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2173; SSE2-NEXT: movaps %xmm1, %xmm0 2174; SSE2-NEXT: retq 2175; 2176; SSE3-LABEL: insert_reg_and_zero_v4f32: 2177; SSE3: # %bb.0: 2178; SSE3-NEXT: xorps %xmm1, %xmm1 2179; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2180; SSE3-NEXT: movaps %xmm1, %xmm0 2181; SSE3-NEXT: retq 2182; 2183; SSSE3-LABEL: insert_reg_and_zero_v4f32: 2184; SSSE3: # %bb.0: 2185; SSSE3-NEXT: xorps %xmm1, %xmm1 2186; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2187; SSSE3-NEXT: movaps %xmm1, %xmm0 2188; SSSE3-NEXT: retq 2189; 2190; SSE41-LABEL: insert_reg_and_zero_v4f32: 2191; SSE41: # %bb.0: 2192; SSE41-NEXT: xorps %xmm1, %xmm1 2193; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 2194; SSE41-NEXT: retq 2195; 2196; AVX-LABEL: insert_reg_and_zero_v4f32: 2197; AVX: # %bb.0: 2198; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 2199; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 2200; AVX-NEXT: retq 2201 %v = insertelement <4 x float> undef, float %a, i32 0 2202 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2203 ret <4 x float> %shuffle 2204} 2205 2206define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) { 2207; SSE-LABEL: insert_mem_and_zero_v4f32: 2208; SSE: # %bb.0: 2209; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2210; SSE-NEXT: retq 2211; 2212; AVX-LABEL: insert_mem_and_zero_v4f32: 2213; AVX: # %bb.0: 2214; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2215; AVX-NEXT: retq 2216 %a = load float, float* %ptr 2217 %v = insertelement <4 x float> undef, float %a, i32 0 2218 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2219 ret <4 x float> %shuffle 2220} 2221 2222define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) { 2223; SSE2-LABEL: insert_reg_lo_v4i32: 2224; SSE2: # %bb.0: 2225; SSE2-NEXT: movq %rdi, %xmm1 2226; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2227; SSE2-NEXT: retq 2228; 2229; SSE3-LABEL: insert_reg_lo_v4i32: 2230; SSE3: # %bb.0: 2231; SSE3-NEXT: movq %rdi, %xmm1 2232; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2233; SSE3-NEXT: retq 2234; 2235; SSSE3-LABEL: insert_reg_lo_v4i32: 2236; SSSE3: # %bb.0: 2237; SSSE3-NEXT: movq %rdi, %xmm1 2238; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2239; SSSE3-NEXT: retq 2240; 2241; SSE41-LABEL: insert_reg_lo_v4i32: 2242; SSE41: # %bb.0: 2243; SSE41-NEXT: movq %rdi, %xmm1 2244; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2245; SSE41-NEXT: retq 2246; 2247; AVX1-LABEL: insert_reg_lo_v4i32: 2248; AVX1: # %bb.0: 2249; AVX1-NEXT: vmovq %rdi, %xmm1 2250; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2251; AVX1-NEXT: retq 2252; 2253; AVX2OR512VL-LABEL: insert_reg_lo_v4i32: 2254; AVX2OR512VL: # %bb.0: 2255; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1 2256; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2257; AVX2OR512VL-NEXT: retq 2258 %a.cast = bitcast i64 %a to <2 x i32> 2259 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2260 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2261 ret <4 x i32> %shuffle 2262} 2263 2264define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) { 2265; SSE2-LABEL: insert_mem_lo_v4i32: 2266; SSE2: # %bb.0: 2267; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2268; SSE2-NEXT: retq 2269; 2270; SSE3-LABEL: insert_mem_lo_v4i32: 2271; SSE3: # %bb.0: 2272; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2273; SSE3-NEXT: retq 2274; 2275; SSSE3-LABEL: insert_mem_lo_v4i32: 2276; SSSE3: # %bb.0: 2277; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2278; SSSE3-NEXT: retq 2279; 2280; SSE41-LABEL: insert_mem_lo_v4i32: 2281; SSE41: # %bb.0: 2282; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero 2283; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2284; SSE41-NEXT: retq 2285; 2286; AVX-LABEL: insert_mem_lo_v4i32: 2287; AVX: # %bb.0: 2288; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2289; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2290; AVX-NEXT: retq 2291 %a = load <2 x i32>, <2 x i32>* %ptr 2292 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2293 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2294 ret <4 x i32> %shuffle 2295} 2296 2297define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) { 2298; SSE-LABEL: insert_reg_hi_v4i32: 2299; SSE: # %bb.0: 2300; SSE-NEXT: movq %rdi, %xmm1 2301; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2302; SSE-NEXT: retq 2303; 2304; AVX-LABEL: insert_reg_hi_v4i32: 2305; AVX: # %bb.0: 2306; AVX-NEXT: vmovq %rdi, %xmm1 2307; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2308; AVX-NEXT: retq 2309 %a.cast = bitcast i64 %a to <2 x i32> 2310 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2311 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 2312 ret <4 x i32> %shuffle 2313} 2314 2315define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) { 2316; SSE-LABEL: insert_mem_hi_v4i32: 2317; SSE: # %bb.0: 2318; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero 2319; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2320; SSE-NEXT: retq 2321; 2322; AVX-LABEL: insert_mem_hi_v4i32: 2323; AVX: # %bb.0: 2324; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2325; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2326; AVX-NEXT: retq 2327 %a = load <2 x i32>, <2 x i32>* %ptr 2328 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2329 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 2330 ret <4 x i32> %shuffle 2331} 2332 2333define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) { 2334; SSE2-LABEL: insert_reg_lo_v4f32: 2335; SSE2: # %bb.0: 2336; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2337; SSE2-NEXT: retq 2338; 2339; SSE3-LABEL: insert_reg_lo_v4f32: 2340; SSE3: # %bb.0: 2341; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2342; SSE3-NEXT: retq 2343; 2344; SSSE3-LABEL: insert_reg_lo_v4f32: 2345; SSSE3: # %bb.0: 2346; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2347; SSSE3-NEXT: retq 2348; 2349; SSE41-LABEL: insert_reg_lo_v4f32: 2350; SSE41: # %bb.0: 2351; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2352; SSE41-NEXT: retq 2353; 2354; AVX-LABEL: insert_reg_lo_v4f32: 2355; AVX: # %bb.0: 2356; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2357; AVX-NEXT: retq 2358 %a.cast = bitcast double %a to <2 x float> 2359 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2360 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2361 ret <4 x float> %shuffle 2362} 2363 2364define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) { 2365; SSE-LABEL: insert_mem_lo_v4f32: 2366; SSE: # %bb.0: 2367; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2368; SSE-NEXT: retq 2369; 2370; AVX-LABEL: insert_mem_lo_v4f32: 2371; AVX: # %bb.0: 2372; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2373; AVX-NEXT: retq 2374 %a = load <2 x float>, <2 x float>* %ptr 2375 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2376 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2377 ret <4 x float> %shuffle 2378} 2379 2380define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) { 2381; SSE-LABEL: insert_reg_hi_v4f32: 2382; SSE: # %bb.0: 2383; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2384; SSE-NEXT: movaps %xmm1, %xmm0 2385; SSE-NEXT: retq 2386; 2387; AVX-LABEL: insert_reg_hi_v4f32: 2388; AVX: # %bb.0: 2389; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2390; AVX-NEXT: retq 2391 %a.cast = bitcast double %a to <2 x float> 2392 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2393 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 2394 ret <4 x float> %shuffle 2395} 2396 2397define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) { 2398; SSE-LABEL: insert_mem_hi_v4f32: 2399; SSE: # %bb.0: 2400; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 2401; SSE-NEXT: retq 2402; 2403; AVX-LABEL: insert_mem_hi_v4f32: 2404; AVX: # %bb.0: 2405; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 2406; AVX-NEXT: retq 2407 %a = load <2 x float>, <2 x float>* %ptr 2408 %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 2409 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 2410 ret <4 x float> %shuffle 2411} 2412 2413; PR21137 2414define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) { 2415; SSE-LABEL: shuffle_mem_v4f32_3210: 2416; SSE: # %bb.0: 2417; SSE-NEXT: movaps (%rdi), %xmm0 2418; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] 2419; SSE-NEXT: retq 2420; 2421; AVX-LABEL: shuffle_mem_v4f32_3210: 2422; AVX: # %bb.0: 2423; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] 2424; AVX-NEXT: retq 2425 %a = load <4 x float>, <4 x float>* %ptr 2426 %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 2427 ret <4 x float> %shuffle 2428} 2429 2430define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) { 2431; SSE-LABEL: insert_dup_mem_v4i32: 2432; SSE: # %bb.0: 2433; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2434; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2435; SSE-NEXT: retq 2436; 2437; AVX-LABEL: insert_dup_mem_v4i32: 2438; AVX: # %bb.0: 2439; AVX-NEXT: vbroadcastss (%rdi), %xmm0 2440; AVX-NEXT: retq 2441 %tmp = load i32, i32* %ptr, align 4 2442 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 2443 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer 2444 ret <4 x i32> %tmp2 2445} 2446 2447; PR41249 2448define <4 x float> @shuffle_mem_pmovzx_v4f32(<2 x float>* %p0, <4 x float>* %p1) { 2449; SSE-LABEL: shuffle_mem_pmovzx_v4f32: 2450; SSE: # %bb.0: 2451; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2452; SSE-NEXT: xorps %xmm1, %xmm1 2453; SSE-NEXT: movaps %xmm0, %xmm2 2454; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2455; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 2456; SSE-NEXT: movaps %xmm2, (%rsi) 2457; SSE-NEXT: retq 2458; 2459; AVX1-LABEL: shuffle_mem_pmovzx_v4f32: 2460; AVX1: # %bb.0: 2461; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2462; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 2463; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2464; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 2465; AVX1-NEXT: vmovaps %xmm1, (%rsi) 2466; AVX1-NEXT: retq 2467; 2468; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32: 2469; AVX2OR512VL: # %bb.0: 2470; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2471; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 2472; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2473; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0 2474; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi) 2475; AVX2OR512VL-NEXT: retq 2476 %1 = load <2 x float>, <2 x float>* %p0 2477 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> 2478 %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 2479 %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer 2480 store <4 x float> %3, <4 x float>* %p1 2481 ret <4 x float> %4 2482} 2483 2484; 2485; Shuffle to logical bit shifts 2486; 2487 2488define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) { 2489; SSE-LABEL: shuffle_v4i32_z0zX: 2490; SSE: # %bb.0: 2491; SSE-NEXT: psllq $32, %xmm0 2492; SSE-NEXT: retq 2493; 2494; AVX-LABEL: shuffle_v4i32_z0zX: 2495; AVX: # %bb.0: 2496; AVX-NEXT: vpsllq $32, %xmm0, %xmm0 2497; AVX-NEXT: retq 2498 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef> 2499 ret <4 x i32> %shuffle 2500} 2501 2502define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) { 2503; SSE-LABEL: shuffle_v4i32_1z3z: 2504; SSE: # %bb.0: 2505; SSE-NEXT: psrlq $32, %xmm0 2506; SSE-NEXT: retq 2507; 2508; AVX-LABEL: shuffle_v4i32_1z3z: 2509; AVX: # %bb.0: 2510; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 2511; AVX-NEXT: retq 2512 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4> 2513 ret <4 x i32> %shuffle 2514} 2515 2516define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) { 2517; SSE-LABEL: shuffle_mem_v4f32_0145: 2518; SSE: # %bb.0: 2519; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 2520; SSE-NEXT: retq 2521; 2522; AVX-LABEL: shuffle_mem_v4f32_0145: 2523; AVX: # %bb.0: 2524; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] 2525; AVX-NEXT: retq 2526 %b = load <4 x float>, <4 x float>* %pb, align 1 2527 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 2528 ret <4 x float> %shuffle 2529} 2530 2531define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) { 2532; SSE2-LABEL: shuffle_mem_v4f32_4523: 2533; SSE2: # %bb.0: 2534; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2535; SSE2-NEXT: retq 2536; 2537; SSE3-LABEL: shuffle_mem_v4f32_4523: 2538; SSE3: # %bb.0: 2539; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2540; SSE3-NEXT: retq 2541; 2542; SSSE3-LABEL: shuffle_mem_v4f32_4523: 2543; SSSE3: # %bb.0: 2544; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2545; SSSE3-NEXT: retq 2546; 2547; SSE41-LABEL: shuffle_mem_v4f32_4523: 2548; SSE41: # %bb.0: 2549; SSE41-NEXT: movups (%rdi), %xmm1 2550; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2551; SSE41-NEXT: retq 2552; 2553; AVX-LABEL: shuffle_mem_v4f32_4523: 2554; AVX: # %bb.0: 2555; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] 2556; AVX-NEXT: retq 2557 %b = load <4 x float>, <4 x float>* %pb, align 1 2558 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 2559 ret <4 x float> %shuffle 2560} 2561 2562define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) { 2563; SSE-LABEL: shuffle_mem_v4f32_0624: 2564; SSE: # %bb.0: 2565; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] 2566; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,3,1] 2567; SSE-NEXT: retq 2568; 2569; AVX1OR2-LABEL: shuffle_mem_v4f32_0624: 2570; AVX1OR2: # %bb.0: 2571; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] 2572; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] 2573; AVX1OR2-NEXT: retq 2574; 2575; AVX512VL-LABEL: shuffle_mem_v4f32_0624: 2576; AVX512VL: # %bb.0: 2577; AVX512VL-NEXT: vmovaps (%rdi), %xmm2 2578; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,2,4] 2579; AVX512VL-NEXT: vpermi2ps %xmm0, %xmm2, %xmm1 2580; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 2581; AVX512VL-NEXT: retq 2582 %1 = load <4 x float>, <4 x float>* %a1 2583 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 2584 ret <4 x float> %2 2585} 2586 2587define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) { 2588; SSE-LABEL: shuffle_mem_v4f32_4760: 2589; SSE: # %bb.0: 2590; SSE-NEXT: movaps %xmm0, %xmm1 2591; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0] 2592; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2] 2593; SSE-NEXT: retq 2594; 2595; AVX1OR2-LABEL: shuffle_mem_v4f32_4760: 2596; AVX1OR2: # %bb.0: 2597; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0] 2598; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2] 2599; AVX1OR2-NEXT: retq 2600; 2601; AVX512VL-LABEL: shuffle_mem_v4f32_4760: 2602; AVX512VL: # %bb.0: 2603; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,3,2,4] 2604; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0 2605; AVX512VL-NEXT: retq 2606 %1 = load <4 x float>, <4 x float>* %a1 2607 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0> 2608 ret <4 x float> %2 2609} 2610