; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512

; Regression test: after -O3 for -mcpu=knl, each conditional loop below is
; expected to appear in the output as 16-element masked load / gather /
; scatter / store intrinsic calls (see the checks ahead of each function).
;
; NOTE(review): no invocation visible in this file enables the AVX1 check
; prefix, so the AVX1-NOT directive below looks inactive - confirm whether a
; second invocation was intended.

;AVX1-NOT: llvm.masked

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"

; The source code:
;
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
;  for (int i=0; i < SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i] = in[index[i]] + (float) 0.5;
;    }
;  }
;}

;AVX512-LABEL: @foo1
;AVX512: llvm.masked.load.v16i32
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void

; foo1: unoptimized IR for the loop above. The trip count (SIZE in the C
; source) is the constant 4096. The load from %in is indexed by a value read
; from %index, and both it and the store to %out happen only when
; trigger[i] > 0.
; Function Attrs: nounwind uwtable
define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
entry:
  ; Every argument and the induction variable live in stack slots.
  %in.addr = alloca float*, align 8
  %out.addr = alloca float*, align 8
  %trigger.addr = alloca i32*, align 8
  %index.addr = alloca i32*, align 8
  %i = alloca i32, align 4
  store float* %in, float** %in.addr, align 8
  store float* %out, float** %out.addr, align 8
  store i32* %trigger, i32** %trigger.addr, align 8
  store i32* %index, i32** %index.addr, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < 4096.
  %0 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %0, 4096
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; Guard: trigger[i] > 0.
  %1 = load i32, i32* %i, align 4
  %idxprom = sext i32 %1 to i64
  %2 = load i32*, i32** %trigger.addr, align 8
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %3, 0
  br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  ; out[i] = in[index[i]] + 0.5
  %4 = load i32, i32* %i, align 4
  %idxprom2 = sext i32 %4 to i64
  %5 = load i32*, i32** %index.addr, align 8
  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
  %6 = load i32, i32* %arrayidx3, align 4
  %idxprom4 = sext i32 %6 to i64
  %7 = load float*, float** %in.addr, align 8
  %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
  %8 = load float, float* %arrayidx5, align 4
  %add = fadd float %8, 5.000000e-01
  %9 = load i32, i32* %i, align 4
  %idxprom6 = sext i32 %9 to i64
  %10 = load float*, float** %out.addr, align 8
  %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
  store float %add, float* %arrayidx7, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  br label %for.inc

for.inc:                                          ; preds = %if.end
  ; ++i
  %11 = load i32, i32* %i, align 4
  %inc = add nsw i32 %11, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; The source code
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}

; Two-float struct; field index 1 is the member the IR names %b.
%struct.In = type { float, float }

;AVX512-LABEL: @foo2
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void

; foo2: conditional load of field 1 from consecutive %struct.In elements,
; stored to consecutive floats in %out.
; The IR carries a fourth argument, %index, that the C signature above does
; not show; it is spilled to %index.addr and never read back.
; NOTE(review): attribute group #0 has no definition in the visible part of
; this file - confirm whether one exists elsewhere or the reference is stale.
define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
entry:
  %in.addr = alloca %struct.In*, align 8
  %out.addr = alloca float*, align 8
  %trigger.addr = alloca i32*, align 8
  %index.addr = alloca i32*, align 8
  %i = alloca i32, align 4
  store %struct.In* %in, %struct.In** %in.addr, align 8
  store float* %out, float** %out.addr, align 8
  store i32* %trigger, i32** %trigger.addr, align 8
  store i32* %index, i32** %index.addr, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < 4096.
  %0 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %0, 4096
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; Guard: trigger[i] > 0.
  %1 = load i32, i32* %i, align 4
  %idxprom = sext i32 %1 to i64
  %2 = load i32*, i32** %trigger.addr, align 8
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %3, 0
  br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  ; out[i] = in[i].b + 0.5
  %4 = load i32, i32* %i, align 4
  %idxprom2 = sext i32 %4 to i64
  %5 = load %struct.In*, %struct.In** %in.addr, align 8
  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
  %6 = load float, float* %b, align 4
  %add = fadd float %6, 5.000000e-01
  %7 = load i32, i32* %i, align 4
  %idxprom4 = sext i32 %7 to i64
  %8 = load float*, float** %out.addr, align 8
  %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
  store float %add, float* %arrayidx5, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  br label %for.inc

for.inc:                                          ; preds = %if.end
  ; ++i
  %9 = load i32, i32* %i, align 4
  %inc = add nsw i32 %9, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}

; The source code
;struct Out {
;  float a;
;  float b;
;};
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i].b = in[i].b + (float) 0.5;
;    }
;  }
;}

;AVX512-LABEL: @foo3
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void

; Two-float struct matching the C "struct Out" above; field index 1 is b.
%struct.Out = type { float, float }

; foo3: like foo2, but the result is written to field 1 of consecutive
; %struct.Out elements, so both the load and the store address a struct
; member rather than a plain float array.
define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
entry:
  %in.addr = alloca %struct.In*, align 8
  %out.addr = alloca %struct.Out*, align 8
  %trigger.addr = alloca i32*, align 8
  %i = alloca i32, align 4
  store %struct.In* %in, %struct.In** %in.addr, align 8
  store %struct.Out* %out, %struct.Out** %out.addr, align 8
  store i32* %trigger, i32** %trigger.addr, align 8
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Loop while i < 4096.
  %0 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %0, 4096
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; Guard: trigger[i] > 0.
  %1 = load i32, i32* %i, align 4
  %idxprom = sext i32 %1 to i64
  %2 = load i32*, i32** %trigger.addr, align 8
  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
  %3 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %3, 0
  br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  ; out[i].b = in[i].b + 0.5
  %4 = load i32, i32* %i, align 4
  %idxprom2 = sext i32 %4 to i64
  %5 = load %struct.In*, %struct.In** %in.addr, align 8
  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
  %6 = load float, float* %b, align 4
  %add = fadd float %6, 5.000000e-01
  %7 = load i32, i32* %i, align 4
  %idxprom4 = sext i32 %7 to i64
  %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
  %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
  %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
  store float %add, float* %b6, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  br label %for.inc

for.inc:                                          ; preds = %if.end
  ; ++i
  %9 = load i32, i32* %i, align 4
  %inc = add nsw i32 %9, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}
; Declaration of the 16 x float masked scatter intrinsic; none of the
; functions above calls it directly.
declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)