; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2

;; Codegen tests for 256-bit shuffles that move whole 128-bit lanes, run once
;; with +avx and once with +avx2.
;;
;; Naming: the suffix of each shuffle_* function spells out its shuffle mask,
;; one hex digit per index ('u' = undef index, 'z' = lane taken from an
;; all-zero operand). For the <32 x i8> and <16 x i16> tests each digit stands
;; for a 64-bit chunk rather than a single element. A "_mem" suffix means the
;; operands are loaded from memory, "_domain" means an integer add feeds the
;; shuffle, and "_optsize" means the function carries the optsize attribute.
;;
;; The check lines are autogenerated; do not edit them by hand - regenerate
;; them with utils/update_llc_test_checks.py instead.
;;
;; NOTE(review): several functions below are marked readnone although they
;; load through pointer arguments - confirm the attribute is intended.

define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45670123:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45670123_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,0,1]
; AVX2-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45674567:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45674567:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45674567_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45674567_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,2,3]
; AVX2-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_2323:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX2-NEXT: retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu67uu67:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_uu67uu67:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu674567:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_uu674567:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_4567uu67:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_4567uu67:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases that cannot be lowered to a single vperm2f128 (the mask also
;;;; permutes elements inside a 128-bit lane, so a vpermilps follows it)

define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendps over vperm2f128 because that has better performance.
;; TODO: When building for optsize we should use vperm2f128 (the *_optsize
;; variants of the blend cases below still get xor+vblendps).

define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}

;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.

define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2: # %bb.0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases
;;; (ldN = operand N is loaded from memory; hiN/loN = the high/low 128-bit
;;; half of operand N that ends up in the result's low, then high, half)

define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}