; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %1, %a0
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_commute:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %a0, %1
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load1(<4 x i32> %a0, <8 x i16>* %p1, <8 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_load1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a1 = load <8 x i16>, <8 x i16>* %p1
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %1, %a0
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_load2(<4 x i32> %a0, <8 x i16> %a1, <8 x i16>* %p2) {
; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_load2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a2 = load <8 x i16>, <8 x i16>* %p2
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %1, %a0
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load1(<4 x i32> %a0, <8 x i16>* %p1, <8 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a1 = load <8 x i16>, <8 x i16>* %p1
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %a0, %1
  ret <4 x i32> %2
}

define <4 x i32> @test_pmaddwd_v8i16_add_v4i32_commute_load2(<4 x i32> %a0, <8 x i16> %a1, <8 x i16>* %p2) {
; CHECK-LABEL: test_pmaddwd_v8i16_add_v4i32_commute_load2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %a2 = load <8 x i16>, <8 x i16>* %p2
  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> %a2)
  %2 = add <4 x i32> %a0, %1
  ret <4 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %1, %a0
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_commute:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %a0, %1
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load1(<8 x i32> %a0, <16 x i16>* %p1, <16 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_load1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a1 = load <16 x i16>, <16 x i16>* %p1
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %1, %a0
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_load2(<8 x i32> %a0, <16 x i16> %a1, <16 x i16>* %p2) {
; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_load2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a2 = load <16 x i16>, <16 x i16>* %p2
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %1, %a0
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load1(<8 x i32> %a0, <16 x i16>* %p1, <16 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a1 = load <16 x i16>, <16 x i16>* %p1
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %a0, %1
  ret <8 x i32> %2
}

define <8 x i32> @test_pmaddwd_v16i16_add_v8i32_commute_load2(<8 x i32> %a0, <16 x i16> %a1, <16 x i16>* %p2) {
; CHECK-LABEL: test_pmaddwd_v16i16_add_v8i32_commute_load2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a2 = load <16 x i16>, <16 x i16>* %p2
  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a1, <16 x i16> %a2)
  %2 = add <8 x i32> %a0, %1
  ret <8 x i32> %2
}

define <16 x i32> @test_pmaddwd_v32i16_add_v16i32(<16 x i32> %a0, <32 x i16> %a1, <32 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
  %2 = add <16 x i32> %1, %a0
  ret <16 x i32> %2
}

define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_commute(<16 x i32> %a0, <32 x i16> %a1, <32 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_commute:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
  %2 = add <16 x i32> %a0, %1
  ret <16 x i32> %2
}

define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_load1(<16 x i32> %a0, <32 x i16>* %p1, <32 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_load1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %a1 = load <32 x i16>, <32 x i16>* %p1
  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
  %2 = add <16 x i32> %1, %a0
  ret <16 x i32> %2
}

define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_load2(<16 x i32> %a0, <32 x i16> %a1, <32 x i16>* %p2) {
; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_load2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %a2 = load <32 x i16>, <32 x i16>* %p2
  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
  %2 = add <16 x i32> %1, %a0
  ret <16 x i32> %2
}

define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_commute_load1(<16 x i32> %a0, <32 x i16>* %p1, <32 x i16> %a2) {
; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_commute_load1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %a1 = load <32 x i16>, <32 x i16>* %p1
  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
  %2 = add <16 x i32> %a0, %1
  ret <16 x i32> %2
}

define <16 x i32> @test_pmaddwd_v32i16_add_v16i32_commute_load2(<16 x i32> %a0, <32 x i16> %a1, <32 x i16>* %p2) {
; CHECK-LABEL: test_pmaddwd_v32i16_add_v16i32_commute_load2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpdpwssd (%rdi), %zmm1, %zmm0
; CHECK-NEXT:    retq
  %a2 = load <32 x i16>, <32 x i16>* %p2
  %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a2)
  %2 = add <16 x i32> %a0, %1
  ret <16 x i32> %2
}

declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)