// RUN: hlo_to_llvm_ir --ptx --sm=70 --xla_disable_all_hlo_passes=true %s | FileCheck %s
// RUN: hlo_to_llvm_ir --xla_disable_all_hlo_passes=true %s | FileCheck --check-prefix=CHECK-LLVM %s
// We check that the row loads are vectorized.

HloModule SimpleAddRowBroadcasting

%fused_computation.0 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.0 = f32[512,14,14,672]{3,2,1,0} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.0
}

// CHECK-LABEL: fusion_0
// CHECK: .reqntid 168, 1, 1
// CHECK-NOT: ld.global.nc.f
// CHECK-NOT: ld.global.nc.b

// -----

HloModule SimpleAddSmallRowBroadcasting

%fused_computation.0 (param_0: f32[48], param_1: f32[512,14,14,48]) -> f32[512,14,14,48] {
  %param_0 = f32[48]{0} parameter(0)
  %broadcast = f32[512,14,14,48]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,48]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,48]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[48]{0} parameter(0)
  %param_1 = f32[512,14,14,48]{3,2,1,0} parameter(1)

  ROOT %fusion.0_small = f32[512,14,14,48]{3,2,1,0} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.0
}

// CHECK-LABEL: fusion_0_small
// CHECK: .reqntid 12, 11, 1
// CHECK-NOT: ld.global.nc.f
// CHECK-NOT: ld.global.nc.b

// -----

// This tests a BatchNorm fused kernel found in EfficientNet.
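// The fused computation applies the FusedBatchNormGradV3 input-gradient formula together with the
// derivative of the Swish activation, sigma(x) * (1 + x * (1 - sigma(x))), computed in f16. All
// broadcasts are over the innermost 672-element rows (dimensions={3}), so the loads should still be
// emitted in vectorized form.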
HloModule EfficientNetSwish

%fused_computation.1 (param_0.89: f32[672], param_1: f32[672], param_2: f32[672], param_3: f32[672], param_4: f16[512,14,14,672], param_5: f32[672], param_6: f16[512,14,14,672], param_7: f32[672]) -> f16[512,14,14,672] {
  %param_2 = f32[672]{0} parameter(2)
  %constant_157 = f32[] constant(1), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.186 = f32[672]{0} broadcast(f32[] %constant_157), dimensions={}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %param_5 = f32[672]{0} parameter(5)
  %constant_56 = f32[] constant(9.96492327e-06)
  %broadcast.185 = f32[672]{0} broadcast(f32[] %constant_56), dimensions={}
  %multiply.155 = f32[672]{0} multiply(f32[672]{0} %param_5, f32[672]{0} %broadcast.185), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %param_3 = f32[672]{0} parameter(3)
  %multiply.154 = f32[672]{0} multiply(f32[672]{0} %param_3, f32[672]{0} %broadcast.185), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %multiply.153 = f32[672]{0} multiply(f32[672]{0} %multiply.154, f32[672]{0} %multiply.154), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %subtract.15 = f32[672]{0} subtract(f32[672]{0} %multiply.155, f32[672]{0} %multiply.153), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %constant_155 = f32[] constant(0.001), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.184 = f32[672]{0} broadcast(f32[] %constant_155), dimensions={}
  %add.14 = f32[672]{0} add(f32[672]{0} %subtract.15, f32[672]{0} %broadcast.184), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %rsqrt.23 = f32[672]{0} rsqrt(f32[672]{0} %add.14), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %multiply.152 = f32[672]{0} multiply(f32[672]{0} %rsqrt.23, f32[672]{0} %rsqrt.23), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %divide.14 = f32[672]{0} divide(f32[672]{0} %broadcast.186, f32[672]{0} %multiply.152), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %rsqrt.7 = f32[672]{0} rsqrt(f32[672]{0} %divide.14), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.29 = f32[672]{0} multiply(f32[672]{0} %param_2, f32[672]{0} %rsqrt.7), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.28 = f32[672]{0} multiply(f32[672]{0} %multiply.29, f32[672]{0} %broadcast.185), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.47 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %multiply.28), dimensions={3}
  %param_6 = f16[512,14,14,672]{3,2,1,0} parameter(6)
  %constant_194 = f16[] constant(1), metadata={op_type="AddV2" op_name="add"}
  %broadcast.256 = f16[512,14,14,672]{3,2,1,0} broadcast(f16[] %constant_194), dimensions={}
  %param_4 = f16[512,14,14,672]{3,2,1,0} parameter(4)
  %convert.66 = f32[512,14,14,672]{3,2,1,0} convert(f16[512,14,14,672]{3,2,1,0} %param_4), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.254 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %multiply.154), dimensions={3}, metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %subtract.82 = f32[512,14,14,672]{3,2,1,0} subtract(f32[512,14,14,672]{3,2,1,0} %convert.66, f32[512,14,14,672]{3,2,1,0} %broadcast.254), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.251 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %rsqrt.23), dimensions={3}
  %multiply.219 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %subtract.82, f32[512,14,14,672]{3,2,1,0} %broadcast.251), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %broadcast.250 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_2), dimensions={3}, metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %multiply.218 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %multiply.219, f32[512,14,14,672]{3,2,1,0} %broadcast.250), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %param_7 = f32[672]{0} parameter(7)
  %broadcast.249 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_7), dimensions={3}, metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %add.79 = f32[512,14,14,672]{3,2,1,0} add(f32[512,14,14,672]{3,2,1,0} %multiply.218, f32[512,14,14,672]{3,2,1,0} %broadcast.249), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %convert.65 = f16[512,14,14,672]{3,2,1,0} convert(f32[512,14,14,672]{3,2,1,0} %add.79), metadata={op_type="FusedBatchNormV3" op_name="foo/batch_normalization/FusedBatchNormV3"}
  %negate.12 = f16[512,14,14,672]{3,2,1,0} negate(f16[512,14,14,672]{3,2,1,0} %convert.65)
  %exponential.10 = f16[512,14,14,672]{3,2,1,0} exponential(f16[512,14,14,672]{3,2,1,0} %negate.12)
  %add.78 = f16[512,14,14,672]{3,2,1,0} add(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %exponential.10)
  %divide.20 = f16[512,14,14,672]{3,2,1,0} divide(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %add.78), metadata={op_type="Sigmoid" op_name="foo/activation/Sigmoid"}
  %subtract.77 = f16[512,14,14,672]{3,2,1,0} subtract(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %divide.20), metadata={op_type="Sub" op_name="sub"}
  %multiply.211 = f16[512,14,14,672]{3,2,1,0} multiply(f16[512,14,14,672]{3,2,1,0} %convert.65, f16[512,14,14,672]{3,2,1,0} %subtract.77), metadata={op_type="Mul" op_name="mul"}
  %add.75 = f16[512,14,14,672]{3,2,1,0} add(f16[512,14,14,672]{3,2,1,0} %broadcast.256, f16[512,14,14,672]{3,2,1,0} %multiply.211), metadata={op_type="AddV2" op_name="add"}
  %multiply.210 = f16[512,14,14,672]{3,2,1,0} multiply(f16[512,14,14,672]{3,2,1,0} %divide.20, f16[512,14,14,672]{3,2,1,0} %add.75), metadata={op_type="Mul" op_name="mul_1"}
  %multiply.209 = f16[512,14,14,672]{3,2,1,0} multiply(f16[512,14,14,672]{3,2,1,0} %param_6, f16[512,14,14,672]{3,2,1,0} %multiply.210), metadata={op_type="Mul" op_name="mul_2"}
  %convert.8 = f32[512,14,14,672]{3,2,1,0} convert(f16[512,14,14,672]{3,2,1,0} %multiply.209), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %constant_48 = f32[] constant(100352), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.46 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[] %constant_48), dimensions={}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.27 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %convert.8, f32[512,14,14,672]{3,2,1,0} %broadcast.46), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %param_1 = f32[672]{0} parameter(1)
  %broadcast.45 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_1), dimensions={3}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %subtract.10 = f32[512,14,14,672]{3,2,1,0} subtract(f32[512,14,14,672]{3,2,1,0} %multiply.27, f32[512,14,14,672]{3,2,1,0} %broadcast.45), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %param_0.89 = f32[672]{0} parameter(0)
  %broadcast.44 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %param_0.89), dimensions={3}, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.26 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %broadcast.44, f32[512,14,14,672]{3,2,1,0} %subtract.82), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %broadcast.42 = f32[512,14,14,672]{3,2,1,0} broadcast(f32[672]{0} %divide.14), dimensions={3}
  %divide.6 = f32[512,14,14,672]{3,2,1,0} divide(f32[512,14,14,672]{3,2,1,0} %multiply.26, f32[512,14,14,672]{3,2,1,0} %broadcast.42), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %subtract.9 = f32[512,14,14,672]{3,2,1,0} subtract(f32[512,14,14,672]{3,2,1,0} %subtract.10, f32[512,14,14,672]{3,2,1,0} %divide.6), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  %multiply.25 = f32[512,14,14,672]{3,2,1,0} multiply(f32[512,14,14,672]{3,2,1,0} %broadcast.47, f32[512,14,14,672]{3,2,1,0} %subtract.9), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
  ROOT %convert.7 = f16[512,14,14,672]{3,2,1,0} convert(f32[512,14,14,672]{3,2,1,0} %multiply.25), metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[672]{0} parameter(1)
  %param_2 = f32[672]{0} parameter(2)
  %param_3 = f32[672]{0} parameter(3)
  %param_4 = f16[512,14,14,672]{3,2,1,0} parameter(4)
  %param_5 = f32[672]{0} parameter(5)
  %param_6 = f16[512,14,14,672]{3,2,1,0} parameter(6)
  %param_7 = f32[672]{0} parameter(7)

  ROOT %fusion.1 = f16[512,14,14,672]{3,2,1,0} fusion(f32[672]{0} %param_0, f32[672]{0} %param_1, f32[672]{0} %param_2, f32[672]{0} %param_3, f16[512,14,14,672]{3,2,1,0} %param_4, f32[672]{0} %param_5, f16[512,14,14,672]{3,2,1,0} %param_6, f32[672]{0} %param_7), kind=kLoop, calls=%fused_computation.1, metadata={op_type="FusedBatchNormGradV3" op_name="gradient_tape/foo/batch_normalization/FusedBatchNormGradV3"}
}

// CHECK-LABEL: fusion_1
// CHECK: .reqntid 168, 1, 1
// CHECK-NOT: ld.global.nc.f
// CHECK-NOT: ld.global.nc.b

// -----

HloModule TransposeOutput

%fused_computation.2 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
  ROOT %copy = f32[512,14,14,672]{0,2,3,1} copy(%add)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.2 = f32[512,14,14,672]{0,2,3,1} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.2
}
// Check that we didn't do anything. The block size didn't change.
// CHECK-LABEL: fusion_2
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f

// -----

HloModule TransposeInput

%fused_computation.3 (param_0: f32[672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{0,2,3,1} parameter(1)
  %copy = f32[512,14,14,672]{3,2,1,0} copy(%param_1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %copy)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{0,2,3,1} parameter(1)

  ROOT %fusion.3 = f32[512,14,14,672]{3,2,1,0} fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.3
}
// Check that we didn't do anything. The block size didn't change.
// CHECK-LABEL: fusion_3
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f

// -----

HloModule MOF

%fused_computation.4 (param_0: f32[672], param_1: f32[512,14,14,672]) -> (f32[512,14,14,672], f32[512,14,14,672]) {
  %param_0 = f32[672]{0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
  ROOT %tuple = (f32[512,14,14,672]{3,2,1,0}, f32[512,14,14,672]{3,2,1,0}) tuple(%add, %add)
}

ENTRY main {
  %param_0 = f32[672]{0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.4 = (f32[512,14,14,672]{3,2,1,0}, f32[512,14,14,672]{3,2,1,0}) fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.4
}

// Check that we didn't do anything. The block size didn't change.
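// (MOF = multi-output fusion: the fusion root is a tuple and, as the checks below show, the row
// vectorization is not applied in that case.)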
// CHECK-LABEL: fusion_4
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f

// -----

HloModule ScalarBroadcasting

%fused_computation.5 (param_0: f32[], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[] parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[] parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.5 = f32[512,14,14,672] fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.5
}

// CHECK-LABEL: fusion_5
// CHECK: .reqntid 128, 1, 1
// CHECK: ld.global.nc.f

// -----

HloModule NotSupportedBroadcasting

%fused_computation.6 (param_0: f32[14,672], param_1: f32[512,14,14,672]) -> f32[512,14,14,672] {
  %param_0 = f32[14,672]{1,0} parameter(0)
  %broadcast = f32[512,14,14,672]{3,2,1,0} broadcast(%param_0), dimensions={2,3}
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)
  ROOT %add = f32[512,14,14,672]{3,2,1,0} add(%broadcast, %param_1)
}

ENTRY main {
  %param_0 = f32[14,672]{1,0} parameter(0)
  %param_1 = f32[512,14,14,672]{3,2,1,0} parameter(1)

  ROOT %fusion.6 = f32[512,14,14,672] fusion(%param_0, %param_1), kind=kLoop, calls=%fused_computation.6
}

// Check that we didn't do anything. The block size didn't change.
// CHECK-LABEL: fusion_6
// CHECK: .reqntid 256, 1, 1
// CHECK: ld.global.nc.f

// -----
HloModule Module

%fused_computation.7 {
  %constant_2 = f32[] constant(0)
  %broadcast.1 = f32[32,7,7,352]{2,1,3,0} broadcast(f32[] %constant_2), dimensions={}
  %param_1.2 = f32[32,7,7,320]{2,1,3,0} parameter(1)
  %param_2.1 = f32[32,7,7,224]{2,1,3,0} parameter(2)
  %param_3.1 = f32[32,7,7,128]{2,1,3,0} parameter(3)
  %tmp_8.1 = f32[32,7,7,1024]{2,1,3,0} concatenate(f32[32,7,7,352]{2,1,3,0} %broadcast.1, f32[32,7,7,320]{2,1,3,0} %param_1.2, f32[32,7,7,224]{2,1,3,0} %param_2.1, f32[32,7,7,128]{2,1,3,0} %param_3.1), dimensions={3}
  %param_0.1 = f32[32,7,7,1024]{2,1,3,0} parameter(0)
  ROOT %tmp_10.1 = f32[32,7,7,1024]{2,1,3,0} add(f32[32,7,7,1024]{2,1,3,0} %tmp_8.1, f32[32,7,7,1024]{2,1,3,0} %param_0.1)
}

ENTRY %computation {
  %tmp_0 = u8[32,224,224,3]{3,2,1,0} parameter(0)
  %tmp_9 = f32[32,7,7,1024]{2,1,3,0} constant({...})
  %tmp_5 = f32[32,7,7,320]{2,1,3,0} constant({...})
  %tmp_6 = f32[32,7,7,224]{2,1,3,0} constant({...})
  %tmp_7 = f32[32,7,7,128]{2,1,3,0} constant({...})
  ROOT %fusion.7 = f32[32,7,7,1024]{2,1,3,0} fusion(f32[32,7,7,1024]{2,1,3,0} %tmp_9, f32[32,7,7,320]{2,1,3,0} %tmp_5, f32[32,7,7,224]{2,1,3,0} %tmp_6, f32[32,7,7,128]{2,1,3,0} %tmp_7), kind=kLoop, calls=%fused_computation.7
}


// This graph triggered a bug where the new row indexing was generated even though it should not have been.
// CHECK-LLVM-LABEL: @fusion_7
// CHECK-LLVM-NOT: row_index

// -----
HloModule RowToLong

ENTRY main {
  %param_0 = f32[2025]{0} parameter(0)
  ROOT %broadcastRowToLong = f32[3025,2025]{1,0} broadcast(%param_0), dimensions={1}
}
// Check that we didn't emit the simpler row broadcasting.
// CHECK-LLVM-LABEL: @broadcastRowToLong
// CHECK-LLVM-NOT: row_index

// -----

HloModule module

ENTRY computation {
  p0 = f16[5000,64,64,32] parameter(0)
  zero = f16[] constant(0)

  ROOT pad1 = f16[5000,65,65,32] pad(p0, zero), padding=0_0x0_1x0_1x0_0
}

// Check that we emit a vectorized read.
// CHECK: ld.global.nc.v4.f32
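
// For reference, a scalar row load in PTX looks roughly like
//   ld.global.nc.f32 %f1, [%rd1];
// whereas a vectorized one loads four elements at once:
//   ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
// The patterns above distinguish these two forms.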