1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-SLOW 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-FAST 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST 7 8define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) { 9; AVX1-LABEL: shuffle_v4f64_0000: 10; AVX1: # %bb.0: 11; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 12; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 13; AVX1-NEXT: retq 14; 15; AVX2-LABEL: shuffle_v4f64_0000: 16; AVX2: # %bb.0: 17; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 18; AVX2-NEXT: retq 19; 20; AVX512VL-LABEL: shuffle_v4f64_0000: 21; AVX512VL: # %bb.0: 22; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 23; AVX512VL-NEXT: retq 24 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0> 25 ret <4 x double> %shuffle 26} 27 28define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) { 29; AVX1-LABEL: shuffle_v4f64_0001: 30; AVX1: # %bb.0: 31; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] 32; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 33; AVX1-NEXT: retq 34; 35; AVX2-LABEL: shuffle_v4f64_0001: 36; AVX2: # %bb.0: 37; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] 38; AVX2-NEXT: retq 39; 40; AVX512VL-LABEL: shuffle_v4f64_0001: 41; AVX512VL: # %bb.0: 42; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] 43; AVX512VL-NEXT: retq 44 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> 45 ret <4 x double> %shuffle 46} 47 48define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) { 49; AVX1-LABEL: shuffle_v4f64_0020: 50; AVX1: # %bb.0: 51; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 52; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 53; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 54; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 55; AVX1-NEXT: retq 56; 57; AVX2-LABEL: shuffle_v4f64_0020: 58; AVX2: # %bb.0: 59; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] 60; AVX2-NEXT: retq 61; 62; AVX512VL-LABEL: shuffle_v4f64_0020: 63; AVX512VL: # %bb.0: 64; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] 65; AVX512VL-NEXT: retq 66 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> 67 ret <4 x double> %shuffle 68} 69 70define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) { 71; AVX1-LABEL: shuffle_v4f64_0300: 72; AVX1: # %bb.0: 73; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 74; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2] 75; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] 76; AVX1-NEXT: retq 77; 78; AVX2-LABEL: shuffle_v4f64_0300: 79; AVX2: # %bb.0: 80; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] 81; AVX2-NEXT: retq 82; 83; AVX512VL-LABEL: shuffle_v4f64_0300: 84; AVX512VL: # %bb.0: 85; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,3,0,0] 86; AVX512VL-NEXT: retq 87 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> 88 ret <4 x double> %shuffle 89} 90 91define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) { 92; AVX1-LABEL: shuffle_v4f64_1000: 93; AVX1: # %bb.0: 94; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 95; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 96; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 97; AVX1-NEXT: retq 98; 99; AVX2-LABEL: shuffle_v4f64_1000: 100; AVX2: # %bb.0: 101; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] 102; AVX2-NEXT: retq 103; 104; AVX512VL-LABEL: shuffle_v4f64_1000: 105; AVX512VL: # %bb.0: 106; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] 107; AVX512VL-NEXT: retq 108 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 109 ret <4 x double> %shuffle 110} 111 112define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) { 113; AVX1-LABEL: shuffle_v4f64_2200: 114; AVX1: # %bb.0: 115; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 116; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] 117; AVX1-NEXT: retq 118; 119; AVX2-LABEL: shuffle_v4f64_2200: 120; AVX2: # %bb.0: 121; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] 122; AVX2-NEXT: retq 123; 124; AVX512VL-LABEL: shuffle_v4f64_2200: 125; AVX512VL: # %bb.0: 126; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] 127; AVX512VL-NEXT: retq 128 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> 129 ret <4 x double> %shuffle 130} 131 132define <4 x double> @shuffle_v4f64_2222(<4 x double> %a, <4 x double> %b) { 133; AVX1-LABEL: shuffle_v4f64_2222: 134; AVX1: # %bb.0: 135; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 136; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 137; AVX1-NEXT: retq 138; 139; AVX2-LABEL: shuffle_v4f64_2222: 140; AVX2: # %bb.0: 141; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] 142; AVX2-NEXT: retq 143; 144; AVX512VL-LABEL: shuffle_v4f64_2222: 145; AVX512VL: # %bb.0: 146; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] 147; AVX512VL-NEXT: retq 148 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 149 ret <4 x double> %shuffle 150} 151 152define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) { 153; AVX1-LABEL: shuffle_v4f64_2222_bc: 154; AVX1: # %bb.0: 155; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 156; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 157; AVX1-NEXT: retq 158; 159; AVX2-LABEL: shuffle_v4f64_2222_bc: 160; AVX2: # %bb.0: 161; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] 162; AVX2-NEXT: retq 163; 164; AVX512VL-LABEL: shuffle_v4f64_2222_bc: 165; AVX512VL: # %bb.0: 166; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] 167; AVX512VL-NEXT: retq 168 %tmp0 = bitcast <4 x i64> %a to <4 x double> 169 %tmp1 = bitcast <4 x i64> %b to <4 x double> 170 %shuffle = shufflevector <4 x double> %tmp0, <4 x double> %tmp1, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 171 ret <4 x double> %shuffle 172} 173 174define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) { 175; AVX1-LABEL: shuffle_v4f64_3330: 176; AVX1: # %bb.0: 177; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 178; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] 179; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2] 180; AVX1-NEXT: retq 181; 182; AVX2-LABEL: shuffle_v4f64_3330: 183; AVX2: # %bb.0: 184; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[3,3,3,0] 185; AVX2-NEXT: retq 186; 187; AVX512VL-LABEL: shuffle_v4f64_3330: 188; AVX512VL: # %bb.0: 189; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] 190; AVX512VL-NEXT: retq 191 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> 192 ret <4 x double> %shuffle 193} 194 195define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) { 196; AVX1-LABEL: shuffle_v4f64_3210: 197; AVX1: # %bb.0: 198; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 199; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] 200; AVX1-NEXT: retq 201; 202; AVX2-LABEL: shuffle_v4f64_3210: 203; AVX2: # %bb.0: 204; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] 205; AVX2-NEXT: retq 206; 207; AVX512VL-LABEL: shuffle_v4f64_3210: 208; AVX512VL: # %bb.0: 209; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] 210; AVX512VL-NEXT: retq 211 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 212 ret <4 x double> %shuffle 213} 214 215define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) { 216; ALL-LABEL: shuffle_v4f64_0023: 217; ALL: # %bb.0: 218; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] 219; ALL-NEXT: retq 220 221 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3> 222 ret <4 x double> %shuffle 223} 224 225define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) { 226; ALL-LABEL: shuffle_v4f64_0022: 227; ALL: # %bb.0: 228; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 229; ALL-NEXT: retq 230 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 231 ret <4 x double> %shuffle 232} 233 234define <4 x double> @shuffle_v4f64mem_0022(<4 x double>* %ptr, <4 x double> %b) { 235; ALL-LABEL: shuffle_v4f64mem_0022: 236; ALL: # %bb.0: 237; ALL-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2] 238; ALL-NEXT: retq 239 %a = load <4 x double>, <4 x double>* %ptr 240 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 241 ret <4 x double> %shuffle 242} 243 244define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) { 245; ALL-LABEL: shuffle_v4f64_1032: 246; ALL: # %bb.0: 247; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 248; ALL-NEXT: retq 249 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 250 ret <4 x double> %shuffle 251} 252 253define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) { 254; ALL-LABEL: shuffle_v4f64_1133: 255; ALL: # %bb.0: 256; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] 257; ALL-NEXT: retq 258 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 259 ret <4 x double> %shuffle 260} 261 262define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) { 263; ALL-LABEL: shuffle_v4f64_1023: 264; ALL: # %bb.0: 265; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] 266; ALL-NEXT: retq 267 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3> 268 ret <4 x double> %shuffle 269} 270 271define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) { 272; ALL-LABEL: shuffle_v4f64_1022: 273; ALL: # %bb.0: 274; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] 275; ALL-NEXT: retq 276 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2> 277 ret <4 x double> %shuffle 278} 279 280define <4 x 
double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) { 281; AVX1-LABEL: shuffle_v4f64_0213: 282; AVX1: # %bb.0: 283; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 284; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] 285; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] 286; AVX1-NEXT: retq 287; 288; AVX2-LABEL: shuffle_v4f64_0213: 289; AVX2: # %bb.0: 290; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] 291; AVX2-NEXT: retq 292; 293; AVX512VL-LABEL: shuffle_v4f64_0213: 294; AVX512VL: # %bb.0: 295; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] 296; AVX512VL-NEXT: retq 297 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3> 298 ret <4 x double> %shuffle 299} 300 301define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) { 302; ALL-LABEL: shuffle_v4f64_0423: 303; ALL: # %bb.0: 304; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] 305; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] 306; ALL-NEXT: retq 307 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3> 308 ret <4 x double> %shuffle 309} 310 311define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) { 312; ALL-LABEL: shuffle_v4f64_0462: 313; ALL: # %bb.0: 314; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] 315; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 316; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] 317; ALL-NEXT: retq 318 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2> 319 ret <4 x double> %shuffle 320} 321 322define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) { 323; ALL-LABEL: shuffle_v4f64_0426: 324; ALL: # %bb.0: 325; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 326; ALL-NEXT: retq 327 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 328 ret <4 x double> %shuffle 329} 330 331define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) { 332; ALL-LABEL: shuffle_v4f64_1537: 333; ALL: # %bb.0: 334; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 335; ALL-NEXT: retq 336 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 337 ret <4 x double> %shuffle 338} 339 340define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) { 341; ALL-LABEL: shuffle_v4f64_4062: 342; ALL: # %bb.0: 343; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 344; ALL-NEXT: retq 345 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2> 346 ret <4 x double> %shuffle 347} 348 349define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) { 350; ALL-LABEL: shuffle_v4f64_5173: 351; ALL: # %bb.0: 352; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 353; ALL-NEXT: retq 354 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3> 355 ret <4 x double> %shuffle 356} 357 358define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) { 359; ALL-LABEL: shuffle_v4f64_5163: 360; ALL: # %bb.0: 361; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3] 362; ALL-NEXT: retq 363 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3> 364 ret <4 x double> %shuffle 365} 366 367define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) { 368; 
ALL-LABEL: shuffle_v4f64_0527: 369; ALL: # %bb.0: 370; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 371; ALL-NEXT: retq 372 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 373 ret <4 x double> %shuffle 374} 375 376define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) { 377; ALL-LABEL: shuffle_v4f64_4163: 378; ALL: # %bb.0: 379; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] 380; ALL-NEXT: retq 381 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 382 ret <4 x double> %shuffle 383} 384 385define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) { 386; ALL-LABEL: shuffle_v4f64_0145: 387; ALL: # %bb.0: 388; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 389; ALL-NEXT: retq 390 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 391 ret <4 x double> %shuffle 392} 393 394define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) { 395; ALL-LABEL: shuffle_v4f64_4501: 396; ALL: # %bb.0: 397; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 398; ALL-NEXT: retq 399 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 400 ret <4 x double> %shuffle 401} 402 403define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) { 404; ALL-LABEL: shuffle_v4f64_0167: 405; ALL: # %bb.0: 406; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 407; ALL-NEXT: retq 408 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 409 ret <4 x double> %shuffle 410} 411 412define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { 413; ALL-LABEL: shuffle_v4f64_1054: 414; ALL: # %bb.0: 415; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 416; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 417; ALL-NEXT: retq 418 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4> 419 ret <4 x double> %shuffle 420} 421 422define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) { 423; AVX1OR2-LABEL: shuffle_v4f64_3254: 424; AVX1OR2: # %bb.0: 425; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] 426; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 427; AVX1OR2-NEXT: retq 428; 429; AVX512VL-SLOW-LABEL: shuffle_v4f64_3254: 430; AVX512VL-SLOW: # %bb.0: 431; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] 432; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 433; AVX512VL-SLOW-NEXT: retq 434; 435; AVX512VL-FAST-LABEL: shuffle_v4f64_3254: 436; AVX512VL-FAST: # %bb.0: 437; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,5,4] 438; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 439; AVX512VL-FAST-NEXT: retq 440 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4> 441 ret <4 x double> %shuffle 442} 443 444define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { 445; AVX1OR2-LABEL: shuffle_v4f64_3276: 446; AVX1OR2: # %bb.0: 447; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 448; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 449; AVX1OR2-NEXT: retq 450; 451; AVX512VL-SLOW-LABEL: shuffle_v4f64_3276: 452; AVX512VL-SLOW: # %bb.0: 453; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 454; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 455; 
AVX512VL-SLOW-NEXT: retq 456; 457; AVX512VL-FAST-LABEL: shuffle_v4f64_3276: 458; AVX512VL-FAST: # %bb.0: 459; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,7,6] 460; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 461; AVX512VL-FAST-NEXT: retq 462 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6> 463 ret <4 x double> %shuffle 464} 465 466define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) { 467; ALL-LABEL: shuffle_v4f64_1076: 468; ALL: # %bb.0: 469; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] 470; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 471; ALL-NEXT: retq 472 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6> 473 ret <4 x double> %shuffle 474} 475 476define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) { 477; AVX1-LABEL: shuffle_v4f64_0415: 478; AVX1: # %bb.0: 479; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] 480; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 481; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 482; AVX1-NEXT: retq 483; 484; AVX2-LABEL: shuffle_v4f64_0415: 485; AVX2: # %bb.0: 486; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] 487; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] 488; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 489; AVX2-NEXT: retq 490; 491; AVX512VL-LABEL: shuffle_v4f64_0415: 492; AVX512VL: # %bb.0: 493; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,1,5] 494; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 495; AVX512VL-NEXT: retq 496 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 497 ret <4 x double> %shuffle 498} 499 500define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) { 501; ALL-LABEL: shuffle_v4f64_u062: 502; ALL: # %bb.0: 503; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 504; ALL-NEXT: retq 505 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 2> 506 ret <4 x double> %shuffle 507} 508 509define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) { 510; ALL-LABEL: shuffle_v4f64_15uu: 511; ALL: # %bb.0: 512; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 513; ALL-NEXT: retq 514 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef> 515 ret <4 x double> %shuffle 516} 517 518define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) { 519; ALL-LABEL: shuffle_v4f64_11uu: 520; ALL: # %bb.0: 521; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 522; ALL-NEXT: retq 523 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef> 524 ret <4 x double> %shuffle 525} 526 527define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) { 528; AVX1-LABEL: shuffle_v4f64_22uu: 529; AVX1: # %bb.0: 530; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 531; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 532; AVX1-NEXT: retq 533; 534; AVX2-LABEL: shuffle_v4f64_22uu: 535; AVX2: # %bb.0: 536; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] 537; AVX2-NEXT: retq 538; 539; AVX512VL-LABEL: shuffle_v4f64_22uu: 540; AVX512VL: # %bb.0: 541; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] 542; AVX512VL-NEXT: retq 543 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef> 544 ret <4 x double> %shuffle 545} 546 547define <4 x double> 
@shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) { 548; AVX1-LABEL: shuffle_v4f64_3333: 549; AVX1: # %bb.0: 550; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] 551; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 552; AVX1-NEXT: retq 553; 554; AVX2-LABEL: shuffle_v4f64_3333: 555; AVX2: # %bb.0: 556; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] 557; AVX2-NEXT: retq 558; 559; AVX512VL-LABEL: shuffle_v4f64_3333: 560; AVX512VL: # %bb.0: 561; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] 562; AVX512VL-NEXT: retq 563 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 564 ret <4 x double> %shuffle 565} 566 567define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) { 568; AVX1-LABEL: shuffle_v4f64_0z3z: 569; AVX1: # %bb.0: 570; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] 571; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 572; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] 573; AVX1-NEXT: retq 574; 575; AVX2-SLOW-LABEL: shuffle_v4f64_0z3z: 576; AVX2-SLOW: # %bb.0: 577; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] 578; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1 579; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] 580; AVX2-SLOW-NEXT: retq 581; 582; AVX2-FAST-LABEL: shuffle_v4f64_0z3z: 583; AVX2-FAST: # %bb.0: 584; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero 585; AVX2-FAST-NEXT: retq 586; 587; AVX512VL-SLOW-LABEL: shuffle_v4f64_0z3z: 588; AVX512VL-SLOW: # %bb.0: 589; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] 590; AVX512VL-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1 591; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] 592; AVX512VL-SLOW-NEXT: retq 593; 594; AVX512VL-FAST-LABEL: shuffle_v4f64_0z3z: 595; AVX512VL-FAST: # %bb.0: 596; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero 597; AVX512VL-FAST-NEXT: retq 598 %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4> 599 ret <4 x double> %shuffle 600} 601 602define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) { 603; AVX1-LABEL: shuffle_v4f64_1z2z: 604; AVX1: # %bb.0: 605; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 606; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] 607; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 608; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 609; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 610; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 611; AVX1-NEXT: retq 612; 613; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z: 614; AVX2-SLOW: # %bb.0: 615; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 616; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 617; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] 618; AVX2-SLOW-NEXT: retq 619; 620; AVX2-FAST-LABEL: shuffle_v4f64_1z2z: 621; AVX2-FAST: # %bb.0: 622; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero 623; AVX2-FAST-NEXT: retq 624; 625; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z: 626; AVX512VL-SLOW: # %bb.0: 627; AVX512VL-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 628; AVX512VL-SLOW-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 629; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] 630; AVX512VL-SLOW-NEXT: retq 631; 632; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z: 633; AVX512VL-FAST: # %bb.0: 634; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero 635; AVX512VL-FAST-NEXT: retq 636 %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4> 637 ret <4 x double> %1 638} 639 640define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) { 641; AVX1-LABEL: shuffle_v4i64_0000: 642; AVX1: # %bb.0: 643; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 644; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 645; AVX1-NEXT: retq 646; 647; AVX2-LABEL: shuffle_v4i64_0000: 648; AVX2: # %bb.0: 649; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 650; AVX2-NEXT: retq 651; 652; AVX512VL-LABEL: shuffle_v4i64_0000: 653; AVX512VL: # %bb.0: 654; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 655; AVX512VL-NEXT: retq 656 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0> 657 ret <4 x i64> %shuffle 658} 659 660define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) { 661; AVX1-LABEL: shuffle_v4i64_0001: 662; AVX1: # %bb.0: 663; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] 664; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 665; AVX1-NEXT: retq 666; 667; AVX2-LABEL: shuffle_v4i64_0001: 668; AVX2: # %bb.0: 669; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] 670; AVX2-NEXT: retq 671; 672; AVX512VL-LABEL: shuffle_v4i64_0001: 673; AVX512VL: # %bb.0: 674; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] 675; AVX512VL-NEXT: retq 676 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> 677 ret <4 x i64> %shuffle 678} 679 680define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) { 681; AVX1-LABEL: shuffle_v4i64_0020: 682; AVX1: # %bb.0: 683; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 684; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 685; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 686; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 687; AVX1-NEXT: retq 688; 689; AVX2-LABEL: shuffle_v4i64_0020: 690; AVX2: # %bb.0: 691; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] 692; AVX2-NEXT: retq 693; 694; AVX512VL-LABEL: shuffle_v4i64_0020: 695; AVX512VL: # %bb.0: 696; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] 697; AVX512VL-NEXT: retq 698 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> 699 ret <4 x i64> %shuffle 700} 701 702define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) { 703; AVX1-LABEL: shuffle_v4i64_0112: 704; AVX1: # %bb.0: 705; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 706; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] 707; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 708; AVX1-NEXT: retq 709; 710; AVX2-LABEL: shuffle_v4i64_0112: 711; AVX2: # %bb.0: 712; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2] 713; AVX2-NEXT: retq 714; 715; AVX512VL-LABEL: shuffle_v4i64_0112: 716; AVX512VL: # %bb.0: 717; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2] 718; AVX512VL-NEXT: retq 719 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2> 720 ret <4 x i64> %shuffle 721} 722 723define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, 
<4 x i64> %b) { 724; AVX1-LABEL: shuffle_v4i64_0300: 725; AVX1: # %bb.0: 726; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 727; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2] 728; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] 729; AVX1-NEXT: retq 730; 731; AVX2-LABEL: shuffle_v4i64_0300: 732; AVX2: # %bb.0: 733; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] 734; AVX2-NEXT: retq 735; 736; AVX512VL-LABEL: shuffle_v4i64_0300: 737; AVX512VL: # %bb.0: 738; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] 739; AVX512VL-NEXT: retq 740 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> 741 ret <4 x i64> %shuffle 742} 743 744define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) { 745; AVX1-LABEL: shuffle_v4i64_1000: 746; AVX1: # %bb.0: 747; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] 748; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 749; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 750; AVX1-NEXT: retq 751; 752; AVX2-LABEL: shuffle_v4i64_1000: 753; AVX2: # %bb.0: 754; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] 755; AVX2-NEXT: retq 756; 757; AVX512VL-LABEL: shuffle_v4i64_1000: 758; AVX512VL: # %bb.0: 759; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] 760; AVX512VL-NEXT: retq 761 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 762 ret <4 x i64> %shuffle 763} 764 765define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) { 766; AVX1-LABEL: shuffle_v4i64_2200: 767; AVX1: # %bb.0: 768; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 769; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] 770; AVX1-NEXT: retq 771; 772; AVX2-LABEL: shuffle_v4i64_2200: 773; AVX2: # %bb.0: 774; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] 775; AVX2-NEXT: retq 776; 777; AVX512VL-LABEL: shuffle_v4i64_2200: 778; AVX512VL: # %bb.0: 779; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] 780; AVX512VL-NEXT: retq 781 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> 782 ret <4 x i64> %shuffle 783} 784 785define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) { 786; AVX1-LABEL: shuffle_v4i64_3330: 787; AVX1: # %bb.0: 788; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 789; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] 790; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2] 791; AVX1-NEXT: retq 792; 793; AVX2-LABEL: shuffle_v4i64_3330: 794; AVX2: # %bb.0: 795; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] 796; AVX2-NEXT: retq 797; 798; AVX512VL-LABEL: shuffle_v4i64_3330: 799; AVX512VL: # %bb.0: 800; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] 801; AVX512VL-NEXT: retq 802 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> 803 ret <4 x i64> %shuffle 804} 805 806define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) { 807; AVX1-LABEL: shuffle_v4i64_3210: 808; AVX1: # %bb.0: 809; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 810; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] 811; AVX1-NEXT: retq 812; 813; AVX2-LABEL: shuffle_v4i64_3210: 814; AVX2: # %bb.0: 815; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] 816; AVX2-NEXT: retq 817; 818; AVX512VL-LABEL: shuffle_v4i64_3210: 819; AVX512VL: # %bb.0: 820; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] 821; AVX512VL-NEXT: retq 822 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 823 ret <4 x i64> %shuffle 824} 825 826define <4 x i64> 
@shuffle_v4i64_0213(<4 x i64> %a, <4 x i64> %b) { 827; AVX1-LABEL: shuffle_v4i64_0213: 828; AVX1: # %bb.0: 829; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 830; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] 831; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] 832; AVX1-NEXT: retq 833; 834; AVX2-LABEL: shuffle_v4i64_0213: 835; AVX2: # %bb.0: 836; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] 837; AVX2-NEXT: retq 838; 839; AVX512VL-LABEL: shuffle_v4i64_0213: 840; AVX512VL: # %bb.0: 841; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] 842; AVX512VL-NEXT: retq 843 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3> 844 ret <4 x i64> %shuffle 845} 846 847define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { 848; AVX1-LABEL: shuffle_v4i64_0124: 849; AVX1: # %bb.0: 850; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] 851; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 852; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 853; AVX1-NEXT: retq 854; 855; AVX2-LABEL: shuffle_v4i64_0124: 856; AVX2: # %bb.0: 857; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 858; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 859; AVX2-NEXT: retq 860; 861; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124: 862; AVX512VL-SLOW: # %bb.0: 863; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 864; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 865; AVX512VL-SLOW-NEXT: retq 866; 867; AVX512VL-FAST-LABEL: shuffle_v4i64_0124: 868; AVX512VL-FAST: # %bb.0: 869; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4] 870; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 871; AVX512VL-FAST-NEXT: retq 872 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 873 ret <4 x i64> %shuffle 874} 875 876define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { 877; AVX1-LABEL: shuffle_v4i64_0142: 878; AVX1: # %bb.0: 879; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 880; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2] 881; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] 882; AVX1-NEXT: retq 883; 884; AVX2-LABEL: shuffle_v4i64_0142: 885; AVX2: # %bb.0: 886; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 887; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2] 888; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 889; AVX2-NEXT: retq 890; 891; AVX512VL-LABEL: shuffle_v4i64_0142: 892; AVX512VL: # %bb.0: 893; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 894; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2] 895; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 896; AVX512VL-NEXT: retq 897 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 898 ret <4 x i64> %shuffle 899} 900 901define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { 902; AVX1-LABEL: shuffle_v4i64_0412: 903; AVX1: # %bb.0: 904; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] 905; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 906; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] 907; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 908; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] 909; AVX1-NEXT: retq 910; 911; AVX2-LABEL: shuffle_v4i64_0412: 912; AVX2: # %bb.0: 913; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 914; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] 915; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 916; AVX2-NEXT: retq 
917; 918; AVX512VL-SLOW-LABEL: shuffle_v4i64_0412: 919; AVX512VL-SLOW: # %bb.0: 920; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm1, %xmm1 921; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] 922; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 923; AVX512VL-SLOW-NEXT: retq 924; 925; AVX512VL-FAST-LABEL: shuffle_v4i64_0412: 926; AVX512VL-FAST: # %bb.0: 927; AVX512VL-FAST-NEXT: vpbroadcastq %xmm1, %xmm1 928; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2] 929; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 930; AVX512VL-FAST-NEXT: retq 931 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2> 932 ret <4 x i64> %shuffle 933} 934 935define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { 936; AVX1-LABEL: shuffle_v4i64_4012: 937; AVX1: # %bb.0: 938; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 939; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] 940; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 941; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 942; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 943; AVX1-NEXT: retq 944; 945; AVX2-LABEL: shuffle_v4i64_4012: 946; AVX2: # %bb.0: 947; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] 948; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 949; AVX2-NEXT: retq 950; 951; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012: 952; AVX512VL-SLOW: # %bb.0: 953; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] 954; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] 955; AVX512VL-SLOW-NEXT: retq 956; 957; AVX512VL-FAST-LABEL: shuffle_v4i64_4012: 958; AVX512VL-FAST: # %bb.0: 959; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] 960; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 961; AVX512VL-FAST-NEXT: retq 962 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2> 963 ret <4 x i64> %shuffle 964} 965 966define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) { 967; ALL-LABEL: shuffle_v4i64_0145: 968; ALL: # %bb.0: 969; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 970; ALL-NEXT: retq 971 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 972 ret <4 x i64> %shuffle 973} 974 975define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { 976; AVX1-LABEL: shuffle_v4i64_0451: 977; AVX1: # %bb.0: 978; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] 979; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 980; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 981; AVX1-NEXT: retq 982; 983; AVX2-LABEL: shuffle_v4i64_0451: 984; AVX2: # %bb.0: 985; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3] 986; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] 987; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] 988; AVX2-NEXT: retq 989; 990; AVX512VL-LABEL: shuffle_v4i64_0451: 991; AVX512VL: # %bb.0: 992; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,5,1] 993; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 994; AVX512VL-NEXT: retq 995 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1> 996 ret <4 x i64> %shuffle 997} 998 999define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) { 1000; ALL-LABEL: shuffle_v4i64_4501: 1001; ALL: # %bb.0: 1002; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1003; ALL-NEXT: retq 1004 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 1005 ret <4 x i64> %shuffle 1006} 
1007 1008define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { 1009; AVX1-LABEL: shuffle_v4i64_4015: 1010; AVX1: # %bb.0: 1011; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] 1012; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1013; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1014; AVX1-NEXT: retq 1015; 1016; AVX2-LABEL: shuffle_v4i64_4015: 1017; AVX2: # %bb.0: 1018; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] 1019; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] 1020; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] 1021; AVX2-NEXT: retq 1022; 1023; AVX512VL-LABEL: shuffle_v4i64_4015: 1024; AVX512VL: # %bb.0: 1025; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,5] 1026; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 1027; AVX512VL-NEXT: retq 1028 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5> 1029 ret <4 x i64> %shuffle 1030} 1031 1032define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) { 1033; AVX1-LABEL: shuffle_v4i64_2u35: 1034; AVX1: # %bb.0: 1035; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1036; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] 1037; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1038; AVX1-NEXT: retq 1039; 1040; AVX2-LABEL: shuffle_v4i64_2u35: 1041; AVX2: # %bb.0: 1042; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 1043; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] 1044; AVX2-NEXT: retq 1045; 1046; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35: 1047; AVX512VL-SLOW: # %bb.0: 1048; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 1049; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] 1050; AVX512VL-SLOW-NEXT: retq 1051; 1052; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35: 1053; AVX512VL-FAST: # %bb.0: 1054; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5] 1055; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 1056; AVX512VL-FAST-NEXT: retq 1057 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5> 1058 ret <4 x i64> %shuffle 1059} 1060 1061define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) { 1062; AVX1-LABEL: shuffle_v4i64_1251: 1063; AVX1: # %bb.0: 1064; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] 1065; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3] 1066; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] 1067; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 1068; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] 1069; AVX1-NEXT: retq 1070; 1071; AVX2-LABEL: shuffle_v4i64_1251: 1072; AVX2: # %bb.0: 1073; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] 1074; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1] 1075; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] 1076; AVX2-NEXT: retq 1077; 1078; AVX512VL-LABEL: shuffle_v4i64_1251: 1079; AVX512VL: # %bb.0: 1080; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,5,1] 1081; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 1082; AVX512VL-NEXT: retq 1083 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1> 1084 ret <4 x i64> %shuffle 1085} 1086 1087define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) { 1088; AVX1-LABEL: shuffle_v4i64_1054: 1089; AVX1: # %bb.0: 1090; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1091; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 1092; AVX1-NEXT: retq 1093; 1094; AVX2-LABEL: shuffle_v4i64_1054: 1095; AVX2: # %bb.0: 1096; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 
1097; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1098; AVX2-NEXT: retq 1099; 1100; AVX512VL-LABEL: shuffle_v4i64_1054: 1101; AVX512VL: # %bb.0: 1102; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1103; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1104; AVX512VL-NEXT: retq 1105 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4> 1106 ret <4 x i64> %shuffle 1107} 1108 1109define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) { 1110; AVX1-LABEL: shuffle_v4i64_3254: 1111; AVX1: # %bb.0: 1112; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] 1113; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 1114; AVX1-NEXT: retq 1115; 1116; AVX2-LABEL: shuffle_v4i64_3254: 1117; AVX2: # %bb.0: 1118; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] 1119; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1120; AVX2-NEXT: retq 1121; 1122; AVX512VL-SLOW-LABEL: shuffle_v4i64_3254: 1123; AVX512VL-SLOW: # %bb.0: 1124; AVX512VL-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] 1125; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1126; AVX512VL-SLOW-NEXT: retq 1127; 1128; AVX512VL-FAST-LABEL: shuffle_v4i64_3254: 1129; AVX512VL-FAST: # %bb.0: 1130; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,5,4] 1131; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 1132; AVX512VL-FAST-NEXT: retq 1133 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4> 1134 ret <4 x i64> %shuffle 1135} 1136 1137define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) { 1138; AVX1-LABEL: shuffle_v4i64_3276: 1139; AVX1: # %bb.0: 1140; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 1141; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 1142; AVX1-NEXT: retq 1143; 1144; AVX2-LABEL: shuffle_v4i64_3276: 1145; AVX2: # %bb.0: 1146; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 1147; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1148; AVX2-NEXT: retq 1149; 1150; AVX512VL-SLOW-LABEL: shuffle_v4i64_3276: 1151; AVX512VL-SLOW: # %bb.0: 1152; AVX512VL-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 1153; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1154; AVX512VL-SLOW-NEXT: retq 1155; 1156; AVX512VL-FAST-LABEL: shuffle_v4i64_3276: 1157; AVX512VL-FAST: # %bb.0: 1158; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,7,6] 1159; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 1160; AVX512VL-FAST-NEXT: retq 1161 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6> 1162 ret <4 x i64> %shuffle 1163} 1164 1165define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) { 1166; AVX1-LABEL: shuffle_v4i64_1076: 1167; AVX1: # %bb.0: 1168; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] 1169; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 1170; AVX1-NEXT: retq 1171; 1172; AVX2-LABEL: shuffle_v4i64_1076: 1173; AVX2: # %bb.0: 1174; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1175; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1176; AVX2-NEXT: retq 1177; 1178; AVX512VL-LABEL: shuffle_v4i64_1076: 1179; AVX512VL: # %bb.0: 1180; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 1181; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] 1182; AVX512VL-NEXT: retq 1183 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6> 1184 ret <4 x i64> %shuffle 1185} 1186 
1187define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) { 1188; AVX1-LABEL: shuffle_v4i64_0415: 1189; AVX1: # %bb.0: 1190; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] 1191; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1192; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1193; AVX1-NEXT: retq 1194; 1195; AVX2-LABEL: shuffle_v4i64_0415: 1196; AVX2: # %bb.0: 1197; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] 1198; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] 1199; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] 1200; AVX2-NEXT: retq 1201; 1202; AVX512VL-LABEL: shuffle_v4i64_0415: 1203; AVX512VL: # %bb.0: 1204; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,5] 1205; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 1206; AVX512VL-NEXT: retq 1207 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 1208 ret <4 x i64> %shuffle 1209} 1210 1211define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) { 1212; AVX1-LABEL: shuffle_v4i64_z4z6: 1213; AVX1: # %bb.0: 1214; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1215; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 1216; AVX1-NEXT: retq 1217; 1218; AVX2-LABEL: shuffle_v4i64_z4z6: 1219; AVX2: # %bb.0: 1220; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23] 1221; AVX2-NEXT: retq 1222; 1223; AVX512VL-LABEL: shuffle_v4i64_z4z6: 1224; AVX512VL: # %bb.0: 1225; AVX512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23] 1226; AVX512VL-NEXT: retq 1227 %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6> 1228 ret <4 x i64> %shuffle 1229} 1230 1231define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) { 1232; AVX1-LABEL: shuffle_v4i64_5zuz: 1233; AVX1: # %bb.0: 1234; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1235; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 1236; AVX1-NEXT: retq 1237; 1238; AVX2-LABEL: shuffle_v4i64_5zuz: 1239; AVX2: # %bb.0: 1240; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero 1241; AVX2-NEXT: retq 1242; 1243; AVX512VL-LABEL: shuffle_v4i64_5zuz: 1244; AVX512VL: # %bb.0: 1245; AVX512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero 1246; AVX512VL-NEXT: retq 1247 %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0> 1248 ret <4 x i64> %shuffle 1249} 1250 1251define <4 x i64> @shuffle_v4i64_40u2(<4 x i64> %a, <4 x i64> %b) { 1252; ALL-LABEL: shuffle_v4i64_40u2: 1253; ALL: # %bb.0: 1254; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 1255; ALL-NEXT: retq 1256 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 2> 1257 ret <4 x i64> %shuffle 1258} 1259 1260define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) { 1261; ALL-LABEL: shuffle_v4i64_15uu: 1262; ALL: # %bb.0: 1263; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1264; ALL-NEXT: retq 1265 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef> 1266 ret <4 x i64> %shuffle 1267} 1268 
1269define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) { 1270; ALL-LABEL: shuffle_v4i64_11uu: 1271; ALL: # %bb.0: 1272; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] 1273; ALL-NEXT: retq 1274 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef> 1275 ret <4 x i64> %shuffle 1276} 1277 1278define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) { 1279; AVX1-LABEL: shuffle_v4i64_22uu: 1280; AVX1: # %bb.0: 1281; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1282; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1283; AVX1-NEXT: retq 1284; 1285; AVX2-LABEL: shuffle_v4i64_22uu: 1286; AVX2: # %bb.0: 1287; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] 1288; AVX2-NEXT: retq 1289; 1290; AVX512VL-LABEL: shuffle_v4i64_22uu: 1291; AVX512VL: # %bb.0: 1292; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] 1293; AVX512VL-NEXT: retq 1294 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef> 1295 ret <4 x i64> %shuffle 1296} 1297 1298define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) { 1299; AVX1-LABEL: shuffle_v4i64_3333: 1300; AVX1: # %bb.0: 1301; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] 1302; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 1303; AVX1-NEXT: retq 1304; 1305; AVX2-LABEL: shuffle_v4i64_3333: 1306; AVX2: # %bb.0: 1307; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] 1308; AVX2-NEXT: retq 1309; 1310; AVX512VL-LABEL: shuffle_v4i64_3333: 1311; AVX512VL: # %bb.0: 1312; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] 1313; AVX512VL-NEXT: retq 1314 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1315 ret <4 x i64> %shuffle 1316} 1317 1318define <4 x i64> @shuffle_v4i64_1z3z(<4 x i64> %a, <4 x i64> %b) { 1319; AVX1-LABEL: shuffle_v4i64_1z3z: 1320; AVX1: # %bb.0: 1321; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1322; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] 1323; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1324; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] 1325; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1326; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1327; AVX1-NEXT: retq 1328; 1329; AVX2-LABEL: shuffle_v4i64_1z3z: 1330; AVX2: # %bb.0: 1331; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero 1332; AVX2-NEXT: retq 1333; 1334; AVX512VL-LABEL: shuffle_v4i64_1z3z: 1335; AVX512VL: # %bb.0: 1336; AVX512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero 1337; AVX512VL-NEXT: retq 1338 %shuffle = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 3, i32 4> 1339 ret <4 x i64> %shuffle 1340} 1341 1342define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { 1343; ALL-LABEL: stress_test1: 1344; ALL: retq 1345 %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0> 1346 %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef> 1347 %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef> 1348 %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0> 1349 1350 ret <4 x i64> %f 1351} 1352 1353define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) { 1354; ALL-LABEL: 
insert_reg_and_zero_v4i64: 1355; ALL: # %bb.0: 1356; ALL-NEXT: vmovq %rdi, %xmm0 1357; ALL-NEXT: retq 1358 %v = insertelement <4 x i64> undef, i64 %a, i64 0 1359 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1360 ret <4 x i64> %shuffle 1361} 1362 1363define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) { 1364; ALL-LABEL: insert_mem_and_zero_v4i64: 1365; ALL: # %bb.0: 1366; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1367; ALL-NEXT: retq 1368 %a = load i64, i64* %ptr 1369 %v = insertelement <4 x i64> undef, i64 %a, i64 0 1370 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1371 ret <4 x i64> %shuffle 1372} 1373 1374define <4 x double> @insert_reg_and_zero_v4f64(double %a) { 1375; ALL-LABEL: insert_reg_and_zero_v4f64: 1376; ALL: # %bb.0: 1377; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1378; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 1379; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1380; ALL-NEXT: retq 1381 %v = insertelement <4 x double> undef, double %a, i32 0 1382 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1383 ret <4 x double> %shuffle 1384} 1385 1386define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) { 1387; ALL-LABEL: insert_mem_and_zero_v4f64: 1388; ALL: # %bb.0: 1389; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1390; ALL-NEXT: retq 1391 %a = load double, double* %ptr 1392 %v = insertelement <4 x double> undef, double %a, i32 0 1393 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1394 ret <4 x double> %shuffle 1395} 1396 1397define <4 x double> @splat_mem_v4f64(double* %ptr) { 1398; ALL-LABEL: splat_mem_v4f64: 1399; ALL: # %bb.0: 1400; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 1401; ALL-NEXT: retq 1402 %a = load double, double* %ptr 1403 %v = insertelement <4 x double> undef, double %a, i32 0 1404 %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> 1405 ret <4 x double> %shuffle 1406} 1407 1408define <4 x i64> @splat_mem_v4i64(i64* %ptr) { 1409; ALL-LABEL: splat_mem_v4i64: 1410; ALL: # %bb.0: 1411; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 1412; ALL-NEXT: retq 1413 %a = load i64, i64* %ptr 1414 %v = insertelement <4 x i64> undef, i64 %a, i64 0 1415 %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> 1416 ret <4 x i64> %shuffle 1417} 1418 1419define <4 x double> @splat_mem_v4f64_2(double* %p) { 1420; ALL-LABEL: splat_mem_v4f64_2: 1421; ALL: # %bb.0: 1422; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 1423; ALL-NEXT: retq 1424 %1 = load double, double* %p 1425 %2 = insertelement <2 x double> undef, double %1, i32 0 1426 %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer 1427 ret <4 x double> %3 1428} 1429 1430define <4 x double> @splat_v4f64(<2 x double> %r) { 1431; AVX1-LABEL: splat_v4f64: 1432; AVX1: # %bb.0: 1433; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1434; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1435; AVX1-NEXT: retq 1436; 1437; AVX2-LABEL: splat_v4f64: 1438; AVX2: # %bb.0: 1439; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 1440; AVX2-NEXT: retq 1441; 1442; AVX512VL-LABEL: splat_v4f64: 1443; AVX512VL: # %bb.0: 1444; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 1445; AVX512VL-NEXT: retq 1446 %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer 1447 ret <4 x double> %1 1448} 1449 
1450define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) { 1451; ALL-LABEL: splat_mem_v4i64_from_v2i64: 1452; ALL: # %bb.0: 1453; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 1454; ALL-NEXT: retq 1455 %v = load <2 x i64>, <2 x i64>* %ptr 1456 %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> 1457 ret <4 x i64> %shuffle 1458} 1459 1460define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) { 1461; ALL-LABEL: splat_mem_v4f64_from_v2f64: 1462; ALL: # %bb.0: 1463; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 1464; ALL-NEXT: retq 1465 %v = load <2 x double>, <2 x double>* %ptr 1466 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> 1467 ret <4 x double> %shuffle 1468} 1469 1470define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) { 1471; AVX1OR2-LABEL: splat128_mem_v4i64_from_v2i64: 1472; AVX1OR2: # %bb.0: 1473; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 1474; AVX1OR2-NEXT: retq 1475; 1476; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64: 1477; AVX512VL: # %bb.0: 1478; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 1479; AVX512VL-NEXT: retq 1480 %v = load <2 x i64>, <2 x i64>* %ptr 1481 %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1482 ret <4 x i64> %shuffle 1483} 1484 1485define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) { 1486; ALL-LABEL: splat128_mem_v4f64_from_v2f64: 1487; ALL: # %bb.0: 1488; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 1489; ALL-NEXT: retq 1490 %v = load <2 x double>, <2 x double>* %ptr 1491 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1492 ret <4 x double> %shuffle 1493} 1494 1495define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) { 1496; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64: 1497; AVX1: # %bb.0: 1498; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1499; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1500; AVX1-NEXT: retq 1501; 1502; AVX2-LABEL: broadcast_v4f64_0000_from_v2i64: 1503; AVX2: # %bb.0: 1504; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 1505; AVX2-NEXT: retq 1506; 1507; AVX512VL-LABEL: broadcast_v4f64_0000_from_v2i64: 1508; AVX512VL: # %bb.0: 1509; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 1510; AVX512VL-NEXT: retq 1511 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1512 %2 = bitcast <4 x i64> %1 to <4 x double> 1513 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer 1514 ret <4 x double> %3 1515} 1516 1517define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) { 1518; ALL-LABEL: bitcast_v4f64_0426: 1519; ALL: # %bb.0: 1520; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 1521; ALL-NEXT: retq 1522 %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2> 1523 %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float> 1524 %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 1525 %bitcast16 = bitcast <8 x float> %shuffle32 to <16 x i16> 1526 %shuffle16 = shufflevector <16 x i16> %bitcast16, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13> 1527 %bitcast64 = bitcast <16 x i16> %shuffle16 to <4 x double> 1528 ret <4 x double> %bitcast64 1529} 1530 
define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) {
; ALL-LABEL: concat_v4i64_0167:
; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: retq
  %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
  %a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 6, i32 7>
  %shuffle64 = shufflevector <2 x i64> %a0lo, <2 x i64> %a1hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %shuffle64
}

define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) {
; ALL-LABEL: concat_v4i64_0145_bc:
; ALL: # %bb.0:
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: retq
  %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
  %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5>
  %bc0lo = bitcast <2 x i64> %a0lo to <4 x i32>
  %bc1lo = bitcast <2 x i64> %a1lo to <4 x i32>
  %shuffle32 = shufflevector <4 x i32> %bc0lo, <4 x i32> %bc1lo, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %shuffle64 = bitcast <8 x i32> %shuffle32 to <4 x i64>
  ret <4 x i64> %shuffle64
}

define <4 x i64> @insert_dup_mem_v4i64(i64* %ptr) {
; ALL-LABEL: insert_dup_mem_v4i64:
; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
  %tmp = load i64, i64* %ptr, align 1
  %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
  %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %tmp2
}

define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1234:
; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1234:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1234:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_1230:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1230:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1230:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
  ret <4 x i64> %shuffle
}

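; Shuffles that mix in zero elements. Note the -SLOW/-FAST split below: with
; +fast-variable-shuffle a single zeroing vpshufb is emitted, otherwise a
; permute plus a blend against a zeroed register is preferred.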
define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_z0z3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v4i64_z0z3:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; AVX512VL-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
; AVX512VL-FAST-NEXT: retq
  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>
  ret <4 x i64> %1
}

define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1z2z:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v4i64_1z2z:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-FAST-NEXT: retq
  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
  ret <4 x i64> %1
}

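; Even/odd deinterleave feeding an add. AVX1 must rebuild each half with
; extract/unpck/blend sequences, while AVX512VL with fast variable shuffles
; can use a pair of two-source vpermi2pd/vpermi2q permutes.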
define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: add_v4f64_0246_1357:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],xmm4[0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4f64_0246_1357:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: add_v4f64_0246_1357:
; AVX512VL-SLOW: # %bb.0: # %entry
; AVX512VL-SLOW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: add_v4f64_0246_1357:
; AVX512VL-FAST: # %bb.0: # %entry
; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6]
; AVX512VL-FAST-NEXT: vpermi2pd %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,3,5,7]
; AVX512VL-FAST-NEXT: vpermi2pd %ymm1, %ymm0, %ymm3
; AVX512VL-FAST-NEXT: vaddpd %ymm3, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
entry:
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = fadd <4 x double> %shuffle, %shuffle1
  ret <4 x double> %add
}

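; Same even/odd extraction with the source operands swapped (indices
; 4602/5713), which should simply swap the operand order of the unpacks
; and permutes.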
define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: add_v4f64_4602_5713:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm1[0],xmm4[0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4f64_4602_5713:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: add_v4f64_4602_5713:
; AVX512VL-SLOW: # %bb.0: # %entry
; AVX512VL-SLOW-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: add_v4f64_4602_5713:
; AVX512VL-FAST: # %bb.0: # %entry
; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6]
; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2
; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,3,5,7]
; AVX512VL-FAST-NEXT: vpermi2pd %ymm0, %ymm1, %ymm3
; AVX512VL-FAST-NEXT: vaddpd %ymm3, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
entry:
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
  %add = fadd <4 x double> %shuffle, %shuffle1
  ret <4 x double> %add
}

define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: add_v4i64_0246_1357:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm4[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4i64_0246_1357:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: add_v4i64_0246_1357:
; AVX512VL-SLOW: # %bb.0: # %entry
; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: add_v4i64_0246_1357:
; AVX512VL-FAST: # %bb.0: # %entry
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
; AVX512VL-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
; AVX512VL-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
; AVX512VL-FAST-NEXT: vpaddq %ymm3, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i64> %shuffle, %shuffle1
  ret <4 x i64> %add
}

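; Integer variant of the swapped-operand test. AVX1 has no 256-bit integer
; add, so the vpaddq is split into two 128-bit halves around the shuffle.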
define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: add_v4i64_4602_5713:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4i64_4602_5713:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: add_v4i64_4602_5713:
; AVX512VL-SLOW: # %bb.0: # %entry
; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: add_v4i64_4602_5713:
; AVX512VL-FAST: # %bb.0: # %entry
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
; AVX512VL-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
; AVX512VL-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm3
; AVX512VL-FAST-NEXT: vpaddq %ymm3, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
  %add = add <4 x i64> %shuffle, %shuffle1
  ret <4 x i64> %add
}

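; With optsize, inserting the low element into a zero vector should use the
; compact vmovsd/vmovss form rather than a longer blend sequence.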
define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_0zzz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; ALL-NEXT: retq
  %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %b
}

define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {
; ALL-LABEL: shuffle_v4i64_0zzz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; ALL-NEXT: retq
  %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %b
}

define <8 x float> @shuffle_v8f32_0zzzzzzz_optsize(<8 x float> %a) optsize {
; ALL-LABEL: shuffle_v8f32_0zzzzzzz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT: retq
  %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %b
}

define <8 x i32> @shuffle_v8i32_0zzzzzzz_optsize(<8 x i32> %a) optsize {
; ALL-LABEL: shuffle_v8i32_0zzzzzzz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT: retq
  %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %b
}