; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -lower-matrix-intrinsics -fuse-matrix-use-loops -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -instcombine -verify-dom-info %s -S | FileCheck %s

; REQUIRES: aarch64-registered-target

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "aarch64-apple-ios"

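; The tests below check that llvm.matrix.multiply is lowered to a tiled loop
; nest (cols.header/rows.header/inner.header) rather than fully unrolled
; code, as requested by -fuse-matrix-use-loops with -fuse-matrix-tile-size=2.
; As a rough sketch (not the exact emitted IR; operands are column-major, so
; element (Row, Col) lives at offset Col * NumRows + Row):
;
;   for (ColsIV = 0; ColsIV < Cols; ColsIV += 2)         // cols.header/latch
;     for (RowsIV = 0; RowsIV < Rows; RowsIV += 2)       // rows.header/latch
;       Acc = zero 2x2 tile                              // phis in inner.header
;       for (InnerIV = 0; InnerIV < Inner; InnerIV += 2) // inner.header/latch
;         Acc += Load(A, RowsIV, InnerIV) * Load(B, InnerIV, ColsIV)
;       Store(C, RowsIV, ColsIV) = Acc                   // done in rows.latch
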
define void @multiply_noalias_4x4(<16 x double>* noalias %A, <16 x double>* noalias %B, <16 x double>* noalias %C) {
; CHECK-LABEL: @multiply_noalias_4x4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
; CHECK:       cols.header:
; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
; CHECK:       cols.body:
; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
; CHECK:       rows.header:
; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
; CHECK:       rows.body:
; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
; CHECK:       inner.header:
; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP9:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
; CHECK:       inner.body:
; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[INNER_IV]], 2
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <16 x double>, <16 x double>* [[A:%.*]], i64 0, i64 [[TMP3]]
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP4]], i64 4
; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], [[INNER_IV]]
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr <16 x double>, <16 x double>* [[B:%.*]], i64 0, i64 [[TMP6]]
; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP7]], i64 4
; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD5]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[TMP0]])
; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[COL_LOAD5]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP9]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP8]])
; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD8]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP10:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP1]])
; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD8]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP11]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP10]])
; CHECK-NEXT:    br label [[INNER_LATCH]]
; CHECK:       inner.latch:
; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_STEP]], 4
; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop !0
; CHECK:       rows.latch:
; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_STEP]], 4
; CHECK-NEXT:    [[TMP12:%.*]] = shl i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr <16 x double>, <16 x double>* [[C:%.*]], i64 0, i64 [[TMP13]]
; CHECK-NEXT:    [[VEC_CAST21:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[VEC_CAST21]], align 8
; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr double, double* [[TMP14]], i64 4
; CHECK-NEXT:    [[VEC_CAST23:%.*]] = bitcast double* [[VEC_GEP22]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[VEC_CAST23]], align 8
; CHECK-NEXT:    br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
; CHECK:       cols.latch:
; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[COLS_COND_NOT:%.*]] = icmp eq i64 [[COLS_STEP]], 4
; CHECK-NEXT:    br i1 [[COLS_COND_NOT]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
; CHECK:       continue:
; CHECK-NEXT:    ret void
;

entry:
  %a = load <16 x double>, <16 x double>* %A, align 8
  %b = load <16 x double>, <16 x double>* %B, align 8

  %c = call <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)

  store <16 x double> %c, <16 x double>* %C, align 8
  ret void
}


declare <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double>, <16 x double>, i32, i32, i32)

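; The offsets above follow the usual column-major scheme: element (Row, Col)
; of the 4x4 operands is at Col * 4 + Row, which the shl/add pairs compute.
; The next test multiplies non-square 2x4 and 4x2 operands into a 2x2 result,
; so A, B and C are indexed with strides of 2, 4 and 2 respectively, and the
; rows and cols loops run a single iteration each (instcombine folds their
; exit conditions to compares against 0).
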
define void @multiply_noalias_2x4(<8 x i64>* noalias %A, <8 x i64>* noalias %B, <4 x i64>* noalias %C) {
; CHECK-LABEL: @multiply_noalias_2x4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
; CHECK:       cols.header:
; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
; CHECK:       cols.body:
; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
; CHECK:       rows.header:
; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
; CHECK:       rows.body:
; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
; CHECK:       inner.header:
; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i64> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP15:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
; CHECK:       inner.body:
; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[INNER_IV]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <8 x i64>, <8 x i64>* [[A:%.*]], i64 0, i64 [[TMP3]]
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST]], align 8
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, i64* [[TMP4]], i64 2
; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast i64* [[VEC_GEP]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST1]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], [[INNER_IV]]
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr <8 x i64>, <8 x i64>* [[B:%.*]], i64 0, i64 [[TMP6]]
; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast i64* [[TMP7]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST4]], align 8
; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr i64, i64* [[TMP7]], i64 4
; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast i64* [[VEC_GEP6]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST7]], align 8
; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x i64> [[COL_LOAD5]], <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP8:%.*]] = mul <2 x i64> [[COL_LOAD]], [[SPLAT_SPLAT]]
; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP0]], [[TMP8]]
; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x i64> [[COL_LOAD5]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i64> [[COL_LOAD2]], [[SPLAT_SPLAT12]]
; CHECK-NEXT:    [[TMP11]] = add <2 x i64> [[TMP9]], [[TMP10]]
; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x i64> [[COL_LOAD8]], <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[COL_LOAD]], [[SPLAT_SPLAT16]]
; CHECK-NEXT:    [[TMP13:%.*]] = add <2 x i64> [[TMP1]], [[TMP12]]
; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x i64> [[COL_LOAD8]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP14:%.*]] = mul <2 x i64> [[COL_LOAD2]], [[SPLAT_SPLAT19]]
; CHECK-NEXT:    [[TMP15]] = add <2 x i64> [[TMP13]], [[TMP14]]
; CHECK-NEXT:    br label [[INNER_LATCH]]
; CHECK:       inner.latch:
; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_STEP]], 4
; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop !2
; CHECK:       rows.latch:
; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_IV]], 0
; CHECK-NEXT:    [[TMP16:%.*]] = shl i64 [[COLS_IV]], 1
; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[C:%.*]], i64 0, i64 [[TMP17]]
; CHECK-NEXT:    [[VEC_CAST21:%.*]] = bitcast i64* [[TMP18]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[VEC_CAST21]], align 8
; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr i64, i64* [[TMP18]], i64 2
; CHECK-NEXT:    [[VEC_CAST23:%.*]] = bitcast i64* [[VEC_GEP22]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[VEC_CAST23]], align 8
; CHECK-NEXT:    br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
; CHECK:       cols.latch:
; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[COLS_COND_NOT:%.*]] = icmp eq i64 [[COLS_IV]], 0
; CHECK-NEXT:    br i1 [[COLS_COND_NOT]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
; CHECK:       continue:
; CHECK-NEXT:    ret void
;

; In the inner loop, compute
;   Result += Load(A, ROWS_IV, INNER_IV) * Load(B, INNER_IV, COLS_IV)


; Store the current 2x2 tile.

entry:
  %a = load <8 x i64>, <8 x i64>* %A, align 8
  %b = load <8 x i64>, <8 x i64>* %B, align 8

  %c = call <4 x i64> @llvm.matrix.multiply.v4i64.v8i64.v8i64(<8 x i64> %a, <8 x i64> %b, i32 2, i32 4, i32 2)

  store <4 x i64> %c, <4 x i64>* %C, align 8
  ret void
}


declare <4 x i64> @llvm.matrix.multiply.v4i64.v8i64.v8i64(<8 x i64>, <8 x i64>, i32, i32, i32)

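; The next test multiplies 4x2 and 2x8 operands into a 4x8 result. The shared
; (inner) dimension is only 2, so the inner loop runs a single iteration and
; its exit condition is folded to icmp eq i64 %inner.iv, 0; the loop is
; annotated with !llvm.loop !3, whose unroll count of 1 is checked at the end
; of the file.
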
define void @multiply_noalias_4x2_2x8(<8 x i64>* noalias %A, <16 x i64>* noalias %B, <32 x i64>* noalias %C) {
; CHECK-LABEL: @multiply_noalias_4x2_2x8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
; CHECK:       cols.header:
; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
; CHECK:       cols.body:
; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
; CHECK:       rows.header:
; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
; CHECK:       rows.body:
; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
; CHECK:       inner.header:
; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i64> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP11:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP15:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
; CHECK:       inner.body:
; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[INNER_IV]], 2
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <8 x i64>, <8 x i64>* [[A:%.*]], i64 0, i64 [[TMP3]]
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST]], align 8
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, i64* [[TMP4]], i64 4
; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast i64* [[VEC_GEP]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST1]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[COLS_IV]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], [[INNER_IV]]
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr <16 x i64>, <16 x i64>* [[B:%.*]], i64 0, i64 [[TMP6]]
; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast i64* [[TMP7]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST4]], align 8
; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr i64, i64* [[TMP7]], i64 2
; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast i64* [[VEC_GEP6]] to <2 x i64>*
; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x i64>, <2 x i64>* [[VEC_CAST7]], align 8
; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x i64> [[COL_LOAD5]], <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP8:%.*]] = mul <2 x i64> [[COL_LOAD]], [[SPLAT_SPLAT]]
; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP0]], [[TMP8]]
; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x i64> [[COL_LOAD5]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i64> [[COL_LOAD2]], [[SPLAT_SPLAT12]]
; CHECK-NEXT:    [[TMP11]] = add <2 x i64> [[TMP9]], [[TMP10]]
; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x i64> [[COL_LOAD8]], <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[COL_LOAD]], [[SPLAT_SPLAT16]]
; CHECK-NEXT:    [[TMP13:%.*]] = add <2 x i64> [[TMP1]], [[TMP12]]
; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x i64> [[COL_LOAD8]], <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP14:%.*]] = mul <2 x i64> [[COL_LOAD2]], [[SPLAT_SPLAT19]]
; CHECK-NEXT:    [[TMP15]] = add <2 x i64> [[TMP13]], [[TMP14]]
; CHECK-NEXT:    br label [[INNER_LATCH]]
; CHECK:       inner.latch:
; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_IV]], 0
; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop !3
; CHECK:       rows.latch:
; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_STEP]], 4
; CHECK-NEXT:    [[TMP16:%.*]] = shl i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr <32 x i64>, <32 x i64>* [[C:%.*]], i64 0, i64 [[TMP17]]
; CHECK-NEXT:    [[VEC_CAST21:%.*]] = bitcast i64* [[TMP18]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[VEC_CAST21]], align 8
; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr i64, i64* [[TMP18]], i64 4
; CHECK-NEXT:    [[VEC_CAST23:%.*]] = bitcast i64* [[VEC_GEP22]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP15]], <2 x i64>* [[VEC_CAST23]], align 8
; CHECK-NEXT:    br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
; CHECK:       cols.latch:
; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[COLS_COND_NOT:%.*]] = icmp eq i64 [[COLS_STEP]], 8
; CHECK-NEXT:    br i1 [[COLS_COND_NOT]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
; CHECK:       continue:
; CHECK-NEXT:    ret void
;

; In the inner loop, compute
;   Result += Load(A, ROWS_IV, INNER_IV) * Load(B, INNER_IV, COLS_IV)


; Store the current 2x2 tile.

entry:
  %a = load <8 x i64>, <8 x i64>* %A, align 8
  %b = load <16 x i64>, <16 x i64>* %B, align 8

  %c = call <32 x i64> @llvm.matrix.multiply.v32i64.v8i64.v16i64(<8 x i64> %a, <16 x i64> %b, i32 4, i32 2, i32 8)

  store <32 x i64> %c, <32 x i64>* %C, align 8
  ret void
}

declare <32 x i64> @llvm.matrix.multiply.v32i64.v8i64.v16i64(<8 x i64>, <16 x i64>, i32, i32, i32)


; Check the runtime aliasing checks.
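; Because the pointer arguments below are not marked noalias, the lowering
; guards the fused loops: for each loaded operand it compares the integer
; ranges [Begin, End) of the store destination and of the load via ptrtoint
; and a pair of icmp ugt checks. If the ranges may overlap, the operand is
; memcpy'd into a fresh alloca, and a phi picks either the original pointer
; or the copy for use inside the tiled loops.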
define void @multiply_alias_2x2(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) {
; CHECK-LABEL: @multiply_alias_2x2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint <4 x float>* [[C:%.*]] to i64
; CHECK-NEXT:    [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 16
; CHECK-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint <4 x float>* [[A:%.*]] to i64
; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
; CHECK-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; CHECK:       alias_cont:
; CHECK-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 16
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
; CHECK-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK:       copy:
; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x float>, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float>* [[TMP2]] to i8*
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float>* [[A]] to i8*
; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) [[TMP3]], i8* nonnull align 8 dereferenceable(16) [[TMP4]], i64 16, i1 false)
; CHECK-NEXT:    br label [[NO_ALIAS]]
; CHECK:       no_alias:
; CHECK-NEXT:    [[TMP5:%.*]] = phi <4 x float>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
; CHECK-NEXT:    [[STORE_BEGIN4:%.*]] = ptrtoint <4 x float>* [[C]] to i64
; CHECK-NEXT:    [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 16
; CHECK-NEXT:    [[LOAD_BEGIN6:%.*]] = ptrtoint <4 x float>* [[B:%.*]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
; CHECK-NEXT:    br i1 [[TMP6]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
; CHECK:       alias_cont1:
; CHECK-NEXT:    [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 16
; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
; CHECK-NEXT:    br i1 [[TMP7]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; CHECK:       copy2:
; CHECK-NEXT:    [[TMP8:%.*]] = alloca <4 x float>, align 16
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x float>* [[TMP8]] to i8*
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x float>* [[B]] to i8*
; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) [[TMP9]], i8* nonnull align 8 dereferenceable(16) [[TMP10]], i64 16, i1 false)
; CHECK-NEXT:    br label [[NO_ALIAS3]]
; CHECK:       no_alias3:
; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x float>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT1]] ], [ [[TMP8]], [[COPY2]] ]
; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
; CHECK:       cols.header:
; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[NO_ALIAS3]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
; CHECK:       cols.body:
; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
; CHECK:       rows.header:
; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
; CHECK:       rows.body:
; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
; CHECK:       inner.header:
; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP21:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP23:%.*]], [[INNER_LATCH]] ]
; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
; CHECK:       inner.body:
; CHECK-NEXT:    [[TMP14:%.*]] = shl i64 [[INNER_IV]], 1
; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr <4 x float>, <4 x float>* [[TMP5]], i64 0, i64 [[TMP15]]
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast float* [[TMP16]] to <2 x float>*
; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST]], align 4
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, float* [[TMP16]], i64 2
; CHECK-NEXT:    [[VEC_CAST8:%.*]] = bitcast float* [[VEC_GEP]] to <2 x float>*
; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST8]], align 4
; CHECK-NEXT:    [[TMP17:%.*]] = shl i64 [[COLS_IV]], 1
; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], [[INNER_IV]]
; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr <4 x float>, <4 x float>* [[TMP11]], i64 0, i64 [[TMP18]]
; CHECK-NEXT:    [[VEC_CAST11:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
; CHECK-NEXT:    [[COL_LOAD12:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST11]], align 4
; CHECK-NEXT:    [[VEC_GEP13:%.*]] = getelementptr float, float* [[TMP19]], i64 2
; CHECK-NEXT:    [[VEC_CAST14:%.*]] = bitcast float* [[VEC_GEP13]] to <2 x float>*
; CHECK-NEXT:    [[COL_LOAD15:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST14]], align 4
; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[COL_LOAD12]], <2 x float> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP20:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT]], <2 x float> [[TMP12]])
; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x float> [[COL_LOAD12]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP21]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT19]], <2 x float> [[TMP20]])
; CHECK-NEXT:    [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x float> [[COL_LOAD15]], <2 x float> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP22:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT23]], <2 x float> [[TMP13]])
; CHECK-NEXT:    [[SPLAT_SPLAT26:%.*]] = shufflevector <2 x float> [[COL_LOAD15]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP23]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD9]], <2 x float> [[SPLAT_SPLAT26]], <2 x float> [[TMP22]])
; CHECK-NEXT:    br label [[INNER_LATCH]]
; CHECK:       inner.latch:
; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
; CHECK-NEXT:    [[INNER_COND_NOT:%.*]] = icmp eq i64 [[INNER_IV]], 0
; CHECK-NEXT:    br i1 [[INNER_COND_NOT]], label [[ROWS_LATCH]], label [[INNER_HEADER]], !llvm.loop !5
; CHECK:       rows.latch:
; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
; CHECK-NEXT:    [[ROWS_COND_NOT:%.*]] = icmp eq i64 [[ROWS_IV]], 0
; CHECK-NEXT:    [[TMP24:%.*]] = shl i64 [[COLS_IV]], 1
; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP24]], [[ROWS_IV]]
; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr <4 x float>, <4 x float>* [[C]], i64 0, i64 [[TMP25]]
; CHECK-NEXT:    [[VEC_CAST28:%.*]] = bitcast float* [[TMP26]] to <2 x float>*
; CHECK-NEXT:    store <2 x float> [[TMP21]], <2 x float>* [[VEC_CAST28]], align 8
; CHECK-NEXT:    [[VEC_GEP29:%.*]] = getelementptr float, float* [[TMP26]], i64 2
; CHECK-NEXT:    [[VEC_CAST30:%.*]] = bitcast float* [[VEC_GEP29]] to <2 x float>*
; CHECK-NEXT:    store <2 x float> [[TMP23]], <2 x float>* [[VEC_CAST30]], align 8
; CHECK-NEXT:    br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]]
; CHECK:       cols.latch:
; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
; CHECK-NEXT:    [[COLS_COND_NOT:%.*]] = icmp eq i64 [[COLS_IV]], 0
; CHECK-NEXT:    br i1 [[COLS_COND_NOT]], label [[CONTINUE:%.*]], label [[COLS_HEADER]]
; CHECK:       continue:
; CHECK-NEXT:    ret void
;

; First, check for aliasing at runtime and create non-aliasing copies if required.
entry:
  %a = load <4 x float>, <4 x float>* %A, align 8
  %b = load <4 x float>, <4 x float>* %B, align 8

  %c = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %a, <4 x float> %b, i32 2, i32 2, i32 2)

  store <4 x float> %c, <4 x float>* %C, align 8
  ret void
}

declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)

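; The inner tile loops carry llvm.loop.unroll.count annotations: !0 and !2
; request a count of 2 (shared dimension 4 with 2-wide tiles), while !3 and
; !5 request a count of 1 (shared dimension 2); the count appears to be the
; shared dimension divided by the tile size.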
; CHECK: !0 = distinct !{!0, !1}
; CHECK-NEXT: !1 = !{!"llvm.loop.unroll.count", i32 2}
; CHECK-NEXT: !2 = distinct !{!2, !1}
; CHECK-NEXT: !3 = distinct !{!3, !4}
; CHECK-NEXT: !4 = !{!"llvm.loop.unroll.count", i32 1}
; CHECK-NEXT: !5 = distinct !{!5, !4}