; Regression test: fused (tiled) lowering of llvm.matrix.multiply when the operand loads and/or the result store are volatile.
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -lower-matrix-intrinsics -fuse-matrix-use-loops -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s
3
4; REQUIRES: aarch64-registered-target
5
; Module-level target settings: the data layout string and the
; aarch64-apple-ios target triple used for every function in this file.
6target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
7target triple = "aarch64-apple-ios"
8
; Case 1: both operand loads and the result store are volatile.
; Loads two <4 x double> operands from %A and %B, multiplies them with
; @llvm.matrix.multiply (all three dimension arguments are 2) and stores the
; product through %C. All three pointers are noalias.
;
; The autogenerated expectations below describe a cols/rows/inner loop nest
; whose induction variables step by 2, with the product written back by two
; `store volatile <2 x double>` instructions in rows.latch.
;
; NOTE(review): the expected tile loads from %A and %B are plain
; `load <2 x double>` (no volatile) even though the input loads are volatile.
; Confirm that this is the behaviour the test is meant to pin.
9define void @multiply_all_volatile(<4 x double>* noalias %A, <4 x double>* noalias %B, <4 x double>* noalias %C) {
10; CHECK-LABEL: @multiply_all_volatile(
11; CHECK-NEXT:  entry:
12; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
13; CHECK:       cols.header:
14; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
15; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
16; CHECK:       cols.body:
17; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
18; CHECK:       rows.header:
19; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
20; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
21; CHECK:       rows.body:
22; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
23; CHECK:       inner.header:
24; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
25; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
26; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP23:%.*]], [[INNER_LATCH]] ]
27; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
28; CHECK:       inner.body:
29; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[INNER_IV]], 2
30; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[ROWS_IV]]
31; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double>* [[A:%.*]] to double*
32; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, double* [[TMP4]], i64 [[TMP3]]
33; CHECK-NEXT:    [[COL_CAST:%.*]] = bitcast double* [[TMP5]] to <4 x double>*
34; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double>* [[COL_CAST]] to double*
35; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
36; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
37; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP6]], i64 2
38; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
39; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
40; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[COLS_IV]], 2
41; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[INNER_IV]]
42; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x double>* [[B:%.*]] to double*
43; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr double, double* [[TMP9]], i64 [[TMP8]]
44; CHECK-NEXT:    [[COL_CAST3:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
45; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x double>* [[COL_CAST3]] to double*
46; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast double* [[TMP11]] to <2 x double>*
47; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
48; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP11]], i64 2
49; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
50; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
51; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
52; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
53; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
54; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> undef, double [[TMP12]], i32 0
55; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> undef, <2 x i32> zeroinitializer
56; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
57; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
58; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
59; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> undef, double [[TMP14]], i32 0
60; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> undef, <2 x i32> zeroinitializer
61; CHECK-NEXT:    [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP13]])
62; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
63; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
64; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
65; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
66; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
67; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> undef, double [[TMP18]], i32 0
68; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> undef, <2 x i32> zeroinitializer
69; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
70; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
71; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
72; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> undef, double [[TMP20]], i32 0
73; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> undef, <2 x i32> zeroinitializer
74; CHECK-NEXT:    [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP19]])
75; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
76; CHECK-NEXT:    [[TMP23]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP22]], <2 x i32> <i32 2, i32 3>
77; CHECK-NEXT:    br label [[INNER_LATCH]]
78; CHECK:       inner.latch:
79; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
80; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
81; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop !0
82; CHECK:       rows.latch:
83; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
84; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
85; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[COLS_IV]], 2
86; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP24]], [[ROWS_IV]]
87; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x double>* [[C:%.*]] to double*
88; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[TMP26]], i64 [[TMP25]]
89; CHECK-NEXT:    [[COL_CAST20:%.*]] = bitcast double* [[TMP27]] to <4 x double>*
90; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x double>* [[COL_CAST20]] to double*
91; CHECK-NEXT:    [[VEC_CAST21:%.*]] = bitcast double* [[TMP28]] to <2 x double>*
92; CHECK-NEXT:    store volatile <2 x double> [[TMP17]], <2 x double>* [[VEC_CAST21]], align 8
93; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr double, double* [[TMP28]], i64 2
94; CHECK-NEXT:    [[VEC_CAST23:%.*]] = bitcast double* [[VEC_GEP22]] to <2 x double>*
95; CHECK-NEXT:    store volatile <2 x double> [[TMP23]], <2 x double>* [[VEC_CAST23]], align 8
96; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
97; CHECK:       cols.latch:
98; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
99; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
100; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
101; CHECK:       continue:
102; CHECK-NEXT:    ret void
103;
104
105
106entry:
  ; Input under test: volatile load of each operand, volatile store of the product.
107  %a = load volatile <4 x double>, <4 x double>* %A, align 8
108  %b = load volatile <4 x double>, <4 x double>* %B, align 8
109
110  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
111
112  store volatile <4 x double> %c, <4 x double>* %C, align 8
113  ret void
114}
115
116
; Case 2: only the load of the first operand (%A) is volatile; the load of %B
; and the store to %C are ordinary accesses.
; Multiplies the two <4 x double> operands with @llvm.matrix.multiply (all
; three dimension arguments are 2) and stores the product through %C.
;
; The autogenerated expectations below describe the same cols/rows/inner loop
; nest (induction variables step by 2), with the product written back by two
; plain `store <2 x double>` instructions in rows.latch.
;
; NOTE(review): the expected tile loads from %A are plain `load <2 x double>`
; (no volatile) even though the input load of %A is volatile. Confirm that
; this is the behaviour the test is meant to pin.
117define void @multiply_load0_volatile(<4 x double>* noalias %A, <4 x double>* noalias %B, <4 x double>* noalias %C) {
118; CHECK-LABEL: @multiply_load0_volatile(
119; CHECK-NEXT:  entry:
120; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
121; CHECK:       cols.header:
122; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
123; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
124; CHECK:       cols.body:
125; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
126; CHECK:       rows.header:
127; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
128; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
129; CHECK:       rows.body:
130; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
131; CHECK:       inner.header:
132; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
133; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
134; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP23:%.*]], [[INNER_LATCH]] ]
135; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
136; CHECK:       inner.body:
137; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[INNER_IV]], 2
138; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[ROWS_IV]]
139; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double>* [[A:%.*]] to double*
140; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, double* [[TMP4]], i64 [[TMP3]]
141; CHECK-NEXT:    [[COL_CAST:%.*]] = bitcast double* [[TMP5]] to <4 x double>*
142; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double>* [[COL_CAST]] to double*
143; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
144; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
145; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP6]], i64 2
146; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
147; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
148; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[COLS_IV]], 2
149; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[INNER_IV]]
150; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x double>* [[B:%.*]] to double*
151; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr double, double* [[TMP9]], i64 [[TMP8]]
152; CHECK-NEXT:    [[COL_CAST3:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
153; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x double>* [[COL_CAST3]] to double*
154; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast double* [[TMP11]] to <2 x double>*
155; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
156; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP11]], i64 2
157; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
158; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
159; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
160; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
161; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
162; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> undef, double [[TMP12]], i32 0
163; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> undef, <2 x i32> zeroinitializer
164; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
165; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
166; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
167; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> undef, double [[TMP14]], i32 0
168; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> undef, <2 x i32> zeroinitializer
169; CHECK-NEXT:    [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP13]])
170; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
171; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
172; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
173; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
174; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
175; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> undef, double [[TMP18]], i32 0
176; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> undef, <2 x i32> zeroinitializer
177; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
178; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
179; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
180; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> undef, double [[TMP20]], i32 0
181; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> undef, <2 x i32> zeroinitializer
182; CHECK-NEXT:    [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP19]])
183; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
184; CHECK-NEXT:    [[TMP23]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP22]], <2 x i32> <i32 2, i32 3>
185; CHECK-NEXT:    br label [[INNER_LATCH]]
186; CHECK:       inner.latch:
187; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
188; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
189; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop !2
190; CHECK:       rows.latch:
191; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
192; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
193; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[COLS_IV]], 2
194; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP24]], [[ROWS_IV]]
195; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x double>* [[C:%.*]] to double*
196; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[TMP26]], i64 [[TMP25]]
197; CHECK-NEXT:    [[COL_CAST20:%.*]] = bitcast double* [[TMP27]] to <4 x double>*
198; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x double>* [[COL_CAST20]] to double*
199; CHECK-NEXT:    [[VEC_CAST21:%.*]] = bitcast double* [[TMP28]] to <2 x double>*
200; CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[VEC_CAST21]], align 8
201; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr double, double* [[TMP28]], i64 2
202; CHECK-NEXT:    [[VEC_CAST23:%.*]] = bitcast double* [[VEC_GEP22]] to <2 x double>*
203; CHECK-NEXT:    store <2 x double> [[TMP23]], <2 x double>* [[VEC_CAST23]], align 8
204; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
205; CHECK:       cols.latch:
206; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
207; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
208; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
209; CHECK:       continue:
210; CHECK-NEXT:    ret void
211;
212
213
214entry:
  ; Input under test: only the first operand is loaded with volatile.
215  %a = load volatile <4 x double>, <4 x double>* %A, align 8
216  %b = load <4 x double>, <4 x double>* %B, align 8
217
218  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
219
220  store <4 x double> %c, <4 x double>* %C, align 8
221  ret void
222}
223
; Case 3: only the load of the second operand (%B) is volatile; the load of %A
; and the store to %C are ordinary accesses.
; Multiplies the two <4 x double> operands with @llvm.matrix.multiply (all
; three dimension arguments are 2) and stores the product through %C.
;
; The autogenerated expectations below describe the same cols/rows/inner loop
; nest (induction variables step by 2), with the product written back by two
; plain `store <2 x double>` instructions in rows.latch.
;
; NOTE(review): the expected tile loads from %B are plain `load <2 x double>`
; (no volatile) even though the input load of %B is volatile. Confirm that
; this is the behaviour the test is meant to pin.
224define void @multiply_load1_volatile(<4 x double>* noalias %A, <4 x double>* noalias %B, <4 x double>* noalias %C) {
225; CHECK-LABEL: @multiply_load1_volatile(
226; CHECK-NEXT:  entry:
227; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
228; CHECK:       cols.header:
229; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
230; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
231; CHECK:       cols.body:
232; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
233; CHECK:       rows.header:
234; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
235; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
236; CHECK:       rows.body:
237; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
238; CHECK:       inner.header:
239; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
240; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
241; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP23:%.*]], [[INNER_LATCH]] ]
242; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
243; CHECK:       inner.body:
244; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[INNER_IV]], 2
245; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[ROWS_IV]]
246; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double>* [[A:%.*]] to double*
247; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, double* [[TMP4]], i64 [[TMP3]]
248; CHECK-NEXT:    [[COL_CAST:%.*]] = bitcast double* [[TMP5]] to <4 x double>*
249; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double>* [[COL_CAST]] to double*
250; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
251; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
252; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP6]], i64 2
253; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
254; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
255; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[COLS_IV]], 2
256; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[INNER_IV]]
257; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x double>* [[B:%.*]] to double*
258; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr double, double* [[TMP9]], i64 [[TMP8]]
259; CHECK-NEXT:    [[COL_CAST3:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
260; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x double>* [[COL_CAST3]] to double*
261; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast double* [[TMP11]] to <2 x double>*
262; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
263; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP11]], i64 2
264; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
265; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
266; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
267; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
268; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
269; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> undef, double [[TMP12]], i32 0
270; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> undef, <2 x i32> zeroinitializer
271; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
272; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
273; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
274; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> undef, double [[TMP14]], i32 0
275; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> undef, <2 x i32> zeroinitializer
276; CHECK-NEXT:    [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP13]])
277; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
278; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
279; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
280; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
281; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
282; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> undef, double [[TMP18]], i32 0
283; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> undef, <2 x i32> zeroinitializer
284; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
285; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
286; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
287; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> undef, double [[TMP20]], i32 0
288; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> undef, <2 x i32> zeroinitializer
289; CHECK-NEXT:    [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP19]])
290; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
291; CHECK-NEXT:    [[TMP23]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP22]], <2 x i32> <i32 2, i32 3>
292; CHECK-NEXT:    br label [[INNER_LATCH]]
293; CHECK:       inner.latch:
294; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
295; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
296; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop !3
297; CHECK:       rows.latch:
298; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
299; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
300; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[COLS_IV]], 2
301; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP24]], [[ROWS_IV]]
302; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x double>* [[C:%.*]] to double*
303; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[TMP26]], i64 [[TMP25]]
304; CHECK-NEXT:    [[COL_CAST20:%.*]] = bitcast double* [[TMP27]] to <4 x double>*
305; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x double>* [[COL_CAST20]] to double*
306; CHECK-NEXT:    [[VEC_CAST21:%.*]] = bitcast double* [[TMP28]] to <2 x double>*
307; CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[VEC_CAST21]], align 8
308; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr double, double* [[TMP28]], i64 2
309; CHECK-NEXT:    [[VEC_CAST23:%.*]] = bitcast double* [[VEC_GEP22]] to <2 x double>*
310; CHECK-NEXT:    store <2 x double> [[TMP23]], <2 x double>* [[VEC_CAST23]], align 8
311; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
312; CHECK:       cols.latch:
313; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
314; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
315; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
316; CHECK:       continue:
317; CHECK-NEXT:    ret void
318;
319
320
321entry:
  ; Input under test: only the second operand is loaded with volatile.
322  %a = load <4 x double>, <4 x double>* %A, align 8
323  %b = load volatile <4 x double>, <4 x double>* %B, align 8
324
325  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
326
327  store <4 x double> %c, <4 x double>* %C, align 8
328  ret void
329}
330
; Case 4: only the store of the product to %C is volatile; both operand loads
; are ordinary accesses.
; Multiplies the two <4 x double> operands with @llvm.matrix.multiply (all
; three dimension arguments are 2) and stores the product through %C.
;
; The autogenerated expectations below describe the same cols/rows/inner loop
; nest (induction variables step by 2). The tile loads are plain
; `load <2 x double>`, and the product is written back by two
; `store volatile <2 x double>` instructions in rows.latch, so the volatile
; qualifier of the input store is carried over to the tile stores.
331define void @multiply_store_volatile(<4 x double>* noalias %A, <4 x double>* noalias %B, <4 x double>* noalias %C) {
332; CHECK-LABEL: @multiply_store_volatile(
333; CHECK-NEXT:  entry:
334; CHECK-NEXT:    br label [[COLS_HEADER:%.*]]
335; CHECK:       cols.header:
336; CHECK-NEXT:    [[COLS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[COLS_STEP:%.*]], [[COLS_LATCH:%.*]] ]
337; CHECK-NEXT:    br label [[COLS_BODY:%.*]]
338; CHECK:       cols.body:
339; CHECK-NEXT:    br label [[ROWS_HEADER:%.*]]
340; CHECK:       rows.header:
341; CHECK-NEXT:    [[ROWS_IV:%.*]] = phi i64 [ 0, [[COLS_BODY]] ], [ [[ROWS_STEP:%.*]], [[ROWS_LATCH:%.*]] ]
342; CHECK-NEXT:    br label [[ROWS_BODY:%.*]]
343; CHECK:       rows.body:
344; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
345; CHECK:       inner.header:
346; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[ROWS_BODY]] ], [ [[INNER_STEP:%.*]], [[INNER_LATCH:%.*]] ]
347; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP17:%.*]], [[INNER_LATCH]] ]
348; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ zeroinitializer, [[ROWS_BODY]] ], [ [[TMP23:%.*]], [[INNER_LATCH]] ]
349; CHECK-NEXT:    br label [[INNER_BODY:%.*]]
350; CHECK:       inner.body:
351; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[INNER_IV]], 2
352; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], [[ROWS_IV]]
353; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double>* [[A:%.*]] to double*
354; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr double, double* [[TMP4]], i64 [[TMP3]]
355; CHECK-NEXT:    [[COL_CAST:%.*]] = bitcast double* [[TMP5]] to <4 x double>*
356; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double>* [[COL_CAST]] to double*
357; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
358; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
359; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP6]], i64 2
360; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
361; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
362; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[COLS_IV]], 2
363; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[INNER_IV]]
364; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x double>* [[B:%.*]] to double*
365; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr double, double* [[TMP9]], i64 [[TMP8]]
366; CHECK-NEXT:    [[COL_CAST3:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
367; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x double>* [[COL_CAST3]] to double*
368; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast double* [[TMP11]] to <2 x double>*
369; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST4]], align 8
370; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* [[TMP11]], i64 2
371; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <2 x double>*
372; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST7]], align 8
373; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
374; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
375; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 0
376; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x double> undef, double [[TMP12]], i32 0
377; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT]], <2 x double> undef, <2 x i32> zeroinitializer
378; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK9]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[BLOCK]])
379; CHECK-NEXT:    [[BLOCK10:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
380; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD5]], i64 1
381; CHECK-NEXT:    [[SPLAT_SPLATINSERT11:%.*]] = insertelement <2 x double> undef, double [[TMP14]], i32 0
382; CHECK-NEXT:    [[SPLAT_SPLAT12:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT11]], <2 x double> undef, <2 x i32> zeroinitializer
383; CHECK-NEXT:    [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK10]], <2 x double> [[SPLAT_SPLAT12]], <2 x double> [[TMP13]])
384; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
385; CHECK-NEXT:    [[TMP17]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP16]], <2 x i32> <i32 2, i32 3>
386; CHECK-NEXT:    [[BLOCK13:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
387; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
388; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 0
389; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <2 x double> undef, double [[TMP18]], i32 0
390; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT15]], <2 x double> undef, <2 x i32> zeroinitializer
391; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK14]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[BLOCK13]])
392; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
393; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD8]], i64 1
394; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <2 x double> undef, double [[TMP20]], i32 0
395; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[SPLAT_SPLATINSERT18]], <2 x double> undef, <2 x i32> zeroinitializer
396; CHECK-NEXT:    [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[BLOCK17]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP19]])
397; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP21]], <2 x double> undef, <2 x i32> <i32 0, i32 1>
398; CHECK-NEXT:    [[TMP23]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP22]], <2 x i32> <i32 2, i32 3>
399; CHECK-NEXT:    br label [[INNER_LATCH]]
400; CHECK:       inner.latch:
401; CHECK-NEXT:    [[INNER_STEP]] = add i64 [[INNER_IV]], 2
402; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ne i64 [[INNER_STEP]], 2
403; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[ROWS_LATCH]], !llvm.loop !4
404; CHECK:       rows.latch:
405; CHECK-NEXT:    [[ROWS_STEP]] = add i64 [[ROWS_IV]], 2
406; CHECK-NEXT:    [[ROWS_COND:%.*]] = icmp ne i64 [[ROWS_STEP]], 2
407; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[COLS_IV]], 2
408; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP24]], [[ROWS_IV]]
409; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x double>* [[C:%.*]] to double*
410; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[TMP26]], i64 [[TMP25]]
411; CHECK-NEXT:    [[COL_CAST20:%.*]] = bitcast double* [[TMP27]] to <4 x double>*
412; CHECK-NEXT:    [[TMP28:%.*]] = bitcast <4 x double>* [[COL_CAST20]] to double*
413; CHECK-NEXT:    [[VEC_CAST21:%.*]] = bitcast double* [[TMP28]] to <2 x double>*
414; CHECK-NEXT:    store volatile <2 x double> [[TMP17]], <2 x double>* [[VEC_CAST21]], align 8
415; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr double, double* [[TMP28]], i64 2
416; CHECK-NEXT:    [[VEC_CAST23:%.*]] = bitcast double* [[VEC_GEP22]] to <2 x double>*
417; CHECK-NEXT:    store volatile <2 x double> [[TMP23]], <2 x double>* [[VEC_CAST23]], align 8
418; CHECK-NEXT:    br i1 [[ROWS_COND]], label [[ROWS_HEADER]], label [[COLS_LATCH]]
419; CHECK:       cols.latch:
420; CHECK-NEXT:    [[COLS_STEP]] = add i64 [[COLS_IV]], 2
421; CHECK-NEXT:    [[COLS_COND:%.*]] = icmp ne i64 [[COLS_STEP]], 2
422; CHECK-NEXT:    br i1 [[COLS_COND]], label [[COLS_HEADER]], label [[CONTINUE:%.*]]
423; CHECK:       continue:
424; CHECK-NEXT:    ret void
425;
426
427entry:
  ; Input under test: ordinary operand loads, volatile store of the product.
428  %a = load <4 x double>, <4 x double>* %A, align 8
429  %b = load <4 x double>, <4 x double>* %B, align 8
430
431  %c = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
432
433  store volatile <4 x double> %c, <4 x double>* %C, align 8
434  ret void
435}
436
; Declaration of the matrix-multiply intrinsic called by the test functions in
; this file: two <4 x double> operands followed by three i32 arguments.
; It is declared here without an overload suffix in its name.
437declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32)
438