; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2

define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v32i8_2323:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovaps (%rsi), %ymm1
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases we must not select vperm2f128

define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because that has better performance.
;; TODO: When building for optsize we should use vperm2f128.

define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}

;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
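;; (Note: vperm2f128 executes in the floating-point domain, while vperm2i128 is
;; its AVX2 integer-domain counterpart; the add below gives the shuffle an
;; integer-domain user. On AVX1 there is no 256-bit integer add, so that add is
;; instead split into two 128-bit vpaddq halves.)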

define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1:       ## BB#0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2:       ## BB#0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases

define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

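;; (Note on the integer memory-folding cases: AVX1 has no 256-bit integer add,
;; so the add is performed as two 128-bit halves around the folded vperm2f128,
;; whereas AVX2 folds the load into vperm2i128 and adds with a single ymm
;; operation.)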
define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}