// RUN: hlo_to_llvm_ir --ptx --sm=70 --xla_disable_all_hlo_passes=true %s | FileCheck %s
// RUN: hlo_to_llvm_ir --xla_disable_all_hlo_passes=true %s | FileCheck --check-prefix=CHECK-LLVM %s
// We check that the row loads are vectorized.

HloModule SimpleAddRowBroadcasting

%fused_computation.0 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.0 = f32[512,14,14,672]{3,2,1,0} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.0
}

// CHECK-LABEL: fusion_0
// CHECK: .reqntid 168, 1, 1
// CHECK-NOT: ld.global.nc.f
// CHECK-NOT: ld.global.nc.b
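// Note: each row has 672 f32 elements, so loading them 4 at a time needs 672 / 4 = 168
// threads, which is presumably why the block size is .reqntid 168, 1, 1. Scalar
// ld.global.nc.f*/.b* loads would indicate the row loads were not vectorized, hence the
// CHECK-NOTs above.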

// -----

HloModule SimpleAddSmallRowBroadcasting

%fused_computation.0 (param_0: f32[48], param_1: f32[512,14,14,48]) -> f32[512,14,14,48] {
  %param_0 = f32[48]{0} parameter(0)
  %broadcast = f32[512,14,14,48]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,48]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,48]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[48]{0} parameter(0)
  %param_1 = f32[512,14,14,48]{3,2,1,0} parameter(1)

  ROOT %fusion.0_small = f32[512,14,14,48]{3,2,1,0} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.0
}

// CHECK-LABEL: fusion_0_small
// CHECK: .reqntid 12, 11, 1
// CHECK-NOT: ld.global.nc.f
// CHECK-NOT: ld.global.nc.b
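// Note: a row here is only 48 f32 elements, so 48 / 4 = 12 threads cover one row; the
// second block dimension of 11 presumably packs several short rows into a single block
// (12 x 11 = 132 threads).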

// -----

// This tests a BatchNorm fused kernel found in EfficientNet.
HloModule EfficientNetSwish

%fused_computation.1 (param_0.89: f32[672], param_1: f32[672], param_2: f32[672], param_3: f32[672], param_4: f16[512,14,14,672], param_5: f32[672], param_6: f16[512,14,14,672], param_7: f32[672]) -> f16[512,14,14,672] {
  %param_2 = f32[672]{0} parameter(2)
  %constant_157 = f32[] constant(1), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.186 = f32[672]{0} broadcast(f32[] %constant_157), dimensions={}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %param_5 = f32[672]{0} parameter(5)
  %constant_56 = f32[] constant(9.96492327e-06)
  %broadcast.185 = f32[672]{0} broadcast(f32[] %constant_56), dimensions={}
  %multiply.155 = f32[672]{0} multiply(f32[672]{0} %param_5, f32[672]{0} %broadcast.185), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %param_3 = f32[672]{0} parameter(3)
  %multiply.154 = f32[672]{0} multiply(f32[672]{0} %param_3, f32[672]{0} %broadcast.185), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %multiply.153 = f32[672]{0} multiply(f32[672]{0} %multiply.154, f32[672]{0} %multiply.154), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %subtract.15 = f32[672]{0} subtract(f32[672]{0} %multiply.155, f32[672]{0} %multiply.153), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %constant_155 = f32[] constant(0.001), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.184 = f32[672]{0} broadcast(f32[] %constant_155), dimensions={}
  %add.14 = f32[672]{0} add(f32[672]{0} %subtract.15, f32[672]{0} %broadcast.184), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %rsqrt.23 = f32[672]{0} rsqrt(f32[672]{0} %add.14), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %multiply.152 = f32[672]{0} multiply(f32[672]{0} %rsqrt.23, f32[672]{0} %rsqrt.23), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %divide.14 = f32[672]{0} divide(f32[672]{0} %broadcast.186, f32[672]{0} %multiply.152), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %rsqrt.7 = f32[672]{0} rsqrt(f32[672]{0} %divide.14), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.29 = f32[672]{0} multiply(f32[672]{0} %param_2, f32[672]{0} %rsqrt.7), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.28 = f32[672]{0} multiply(f32[672]{0} %multiply.29, f32[672]{0} %broadcast.185), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.47 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %multiply.28), dimensions={3}
  %param_6 = f16[512,14,14,672]{3,2,1,0} parameter(6)
  %constant_194 = f16[] constant(1), metadata={op_type="AddV2" op_name="add"}
  %broadcast.256 = f16[512,14,14,672]{3,2,1,0} broadcast(f16[] %constant_194), dimensions={}
  %param_4 = f16[512,14,14,672]{3,2,1,0} parameter(4)
  %convert.66 = f32[512,14,14,672]{3,2,1,0} convert(f16[512,14,14,672]{3,2,1,0} %param_4), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.254 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %multiply.154), dimensions={3}, metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %subtract.82 = f32[512,14,14,672]{3,2,1,0} subtract(f32[512,14,14,672]{3,2,1,0} %convert.66, f32[512,14,14,672]{3,2,1,0} %broadcast.254), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.251 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %rsqrt.23), dimensions={3}
  %multiply.219 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %subtract.82, f32[512,14,14,672]{3,2,1,0} %broadcast.251), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.250 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_2), dimensions={3}, metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %multiply.218 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %multiply.219, f32[512,14,14,672]{3,2,1,0} %broadcast.250), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %param_7 = f32[672]{0} parameter(7)
  %broadcast.249 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_7), dimensions={3}, metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %add.79 = f32[512,14,14,672]{3,2,1,0} add(f32[512,14,14,672]{3,2,1,0} %multiply.218, f32[512,14,14,672]{3,2,1,0} %broadcast.249), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %convert.65 = f16[512,14,14,672]{3,2,1,0} convert(f32[512,14,14,672]{3,2,1,0} %add.79), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %negate.12 = f16[512,14,14,672]{3,2,1,0} negate(f16[512,14,14,672]{3,2,1,0} %convert.65)
  %exponential.10 = f16[512,14,14,672]{3,2,1,0} exponential(f16[512,14,14,672]{3,2,1,0} %negate.12)
  %add.78 = f16[512,14,14,672]{3,2,1,0} add(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %exponential.10)
  %divide.20 = f16[512,14,14,672]{3,2,1,0} divide(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %add.78), metadata={op_type="Sigmoid" op_name="foo/activation/Sigmoid"}
  %subtract.77 = f16[512,14,14,672]{3,2,1,0} subtract(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %divide.20), metadata={op_type="Sub" op_name="sub"}
  %multiply.211 = f16[512,14,14,672]{3,2,1,0} multiply(f16[512,14,14,672]{3,2,1,0} %convert.65, f16[512,14,14,672]{3,2,1,0} %subtract.77), metadata={op_type="Mul" op_name="mul"}
  %add.75 = f16[512,14,14,672]{3,2,1,0} add(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %multiply.211), metadata={op_type="AddV2" op_name="add"}
  %multiply.210 = f16[512,14,14,672]{3,2,1,0} multiply(f16[512,14,14,672]{3,2,1,0} %divide.20, f16[512,14,14,672]{3,2,1,0} %add.75), metadata={op_type="Mul" op_name="mul_1"}
  %multiply.209 = f16[512,14,14,672]{3,2,1,0} multiply(f16[512,14,14,672]{3,2,1,0} %param_6, f16[512,14,14,672]{3,2,1,0} %multiply.210), metadata={op_type="Mul" op_name="mul_2"}
  %convert.8 = f32[512,14,14,672]{3,2,1,0} convert(f16[512,14,14,672]{3,2,1,0} %multiply.209), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %constant_48 = f32[] constant(100352), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.46 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[] %constant_48), dimensions={}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.27 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %convert.8, f32[512,14,14,672]{3,2,1,0} %broadcast.46), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %param_1 = f32[672]{0} parameter(1)
  %broadcast.45 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_1), dimensions={3}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %subtract.10 = f32[512,14,14,672]{3,2,1,0} subtract(f32[512,14,14,672]{3,2,1,0} %multiply.27, f32[512,14,14,672]{3,2,1,0} %broadcast.45), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %param_0.89 = f32[672]{0} parameter(0)
  %broadcast.44 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_0.89), dimensions={3}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.26 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %broadcast.44, f32[512,14,14,672]{3,2,1,0} %subtract.82), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.42 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %divide.14), dimensions={3}
  %divide.6 = f32[512,14,14,672]{3,2,1,0} divide(f32[512,14,14,672]{3,2,1,0} %multiply.26, f32[512,14,14,672]{3,2,1,0} %broadcast.42), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %subtract.9 = f32[512,14,14,672]{3,2,1,0} subtract(f32[512,14,14,672]{3,2,1,0} %subtract.10, f32[512,14,14,672]{3,2,1,0} %divide.6), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.25 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %broadcast.47, f32[512,14,14,672]{3,2,1,0} %subtract.9), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  ROOT %convert.7 = f16[512,14,14,672]{3,2,1,0} convert(f32[512,14,14,672]{3,2,1,0} %multiply.25), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[672]{0} parameter(1)
  %param_2 = f32[672]{0} parameter(2)
  %param_3 = f32[672]{0} parameter(3)
  %param_4 = f16[512,14,14,672]{3,2,1,0} parameter(4)
  %param_5 = f32[672]{0} parameter(5)
  %param_6 = f16[512,14,14,672]{3,2,1,0} parameter(6)
  %param_7 = f32[672]{0} parameter(7)

  ROOT %fusion.1 = f16[512,14,14,672]{3,2,1,0} fusion(f32[672]{0} %param_0, f32[672]{0} %param_1, f32[672]{0} %param_2, f32[672]{0} %param_3, f16[512,14,14,672]{3,2,1,0} %param_4, f32[672]{0} %param_5, f16[512,14,14,672]{3,2,1,0} %param_6, f32[672]{0} %param_7), kind=kLoop, calls=%fused_computation.1, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
}

// CHECK-LABEL: fusion_1
// CHECK: .reqntid 168, 1, 1
// CHECK-NOT: ld.global.nc.f
// CHECK-NOT: ld.global.nc.b
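// Note: as in fusion_0, the rows have 672 elements, so vectorizing by 4 again gives
// 672 / 4 = 168 threads per block. The constant 100352 in the fusion appears to be
// 512 * 14 * 14, the per-channel element count, and 9.96492327e-06 its reciprocal.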

// -----

HloModule TransposeOutput

%fused_computation.2 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
  ROOT %copy = f32[512,14,14,672]{0,2,3,1} copy(%add)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.2 = f32[512,14,14,672]{0,2,3,1} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.2
}
// Check that the row vectorization was not applied: the block size is unchanged and
// scalar loads remain.
// CHECK-LABEL: fusion_2
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f
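// Note: with the copy to layout {0,2,3,1}, consecutive elements of a row are no longer
// contiguous in the output, so the row pattern presumably cannot be vectorized and the
// default 256-thread blocks with scalar loads (checked above) are kept.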

// -----

HloModule TransposeInput

%fused_computation.3 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{0,2,3,1} parameter(1)
  %copy = f32[512,14,14,672]{3,2,1,0} copy(%param_1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %copy)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{0,2,3,1} parameter(1)

  ROOT %fusion.3 = f32[512,14,14,672]{3,2,1,0} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.3
}
// Check that the row vectorization was not applied: the block size is unchanged and
// scalar loads remain.
// CHECK-LABEL: fusion_3
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f

// -----

HloModule MOF

%fused_computation.4 (param_0: f32[672], param_1: f32[512,14,14,672]) -> (f32[512,14,14,672], f32[512,14,14,672]) {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
  ROOT %tuple = (f32[512,14,14,672]{3,2,1,0}, f32[512,14,14,672]{3,2,1,0}) tuple(%add, %add)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.4 = (f32[512,14,14,672]{3,2,1,0}, f32[512,14,14,672]) fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.4
}

// Check that the row vectorization was not applied: the block size is unchanged and
// scalar loads remain.
// CHECK-LABEL: fusion_4
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f

// -----

HloModule ScalarBroadcasting

%fused_computation.5 (param_0: f32[], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[] parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[] parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.5 = f32[512,14,14,672] fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.5
}

// CHECK-LABEL: fusion_5
// CHECK: .reqntid 128, 1, 1
// CHECK: ld.global.nc.f

// -----

HloModule NotSupportedBroadcasting

%fused_computation.6 (param_0: f32[14,672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[14,672]{1,0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={2,3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[14,672]{1,0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.6 = f32[512,14,14,672] fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.6
}

// Check that the row vectorization was not applied: the block size is unchanged and
// scalar loads remain.
// CHECK-LABEL: fusion_6
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f

// -----
HloModule Module

%fused_computation.7 {
  %constant_2 = f32[] constant(0)
  %broadcast.1 = f32[32,7,7,352]{2,1,3,0} broadcast(f32[] %constant_2), dimensions={}
  %param_1.2 = f32[32,7,7,320]{2,1,3,0} parameter(1)
  %param_2.1 = f32[32,7,7,224]{2,1,3,0} parameter(2)
  %param_3.1 = f32[32,7,7,128]{2,1,3,0} parameter(3)
  %tmp_8.1 = f32[32,7,7,1024]{2,1,3,0} concatenate(f32[32,7,7,352]{2,1,3,0} %broadcast.1, f32[32,7,7,320]{2,1,3,0} %param_1.2, f32[32,7,7,224]{2,1,3,0} %param_2.1, f32[32,7,7,128]{2,1,3,0} %param_3.1), dimensions={3}
  %param_0.1 = f32[32,7,7,1024]{2,1,3,0} parameter(0)
  ROOT %tmp_10.1 = f32[32,7,7,1024]{2,1,3,0} add(f32[32,7,7,1024]{2,1,3,0} %tmp_8.1, f32[32,7,7,1024]{2,1,3,0} %param_0.1)
}

ENTRY %computation {
  %tmp_0 = u8[32,224,224,3]{3,2,1,0} parameter(0)
  %tmp_9 = f32[32,7,7,1024]{2,1,3,0} constant({...})
  %tmp_5 = f32[32,7,7,320]{2,1,3,0} constant({...})
  %tmp_6 = f32[32,7,7,224]{2,1,3,0} constant({...})
  %tmp_7 = f32[32,7,7,128]{2,1,3,0} constant({...})
  ROOT %fusion.7 = f32[32,7,7,1024]{2,1,3,0} fusion(f32[32,7,7,1024]{2,1,3,0} %tmp_9, f32[32,7,7,320]{2,1,3,0} %tmp_5, f32[32,7,7,224]{2,1,3,0} %tmp_6, f32[32,7,7,128]{2,1,3,0} %tmp_7), kind=kLoop, calls=%fused_computation.7
}

// This graph triggered a bug where the new row indexing was generated even though it
// should not have been.
// CHECK-LLVM-LABEL: @fusion_7
// CHECK-LLVM-NOT: row_index

// -----
HloModule RowToLong

ENTRY main {
  %param_0 = f32[2025]{0} parameter(0)
  ROOT %broadcastRowToLong = f32[3025,2025]{1,0} broadcast(%param_0), dimensions={1}
}
// Check that we didn't emit the simpler row broadcasting.
// CHECK-LLVM-LABEL: @broadcastRowToLong
// CHECK-LLVM-NOT: row_index

// -----

HloModule module

ENTRY computation {
  p0 = f16[5000,64,64,32] parameter(0)
  zero = f16[] constant(0)

  ROOT pad1 = f16[5000,65,65,32] pad(p0, zero), padding=0_0x0_1x0_1x0_0
}

// Check that we emit vectorized reads.
// CHECK: ld.global.nc.v4.f32
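// Note: ld.global.nc.v4.f32 is a single 128-bit load (4 x 32 bits), which for this f16
// input presumably covers 8 consecutive elements per instruction.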
302