/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_

#include <algorithm>
#include <cstring>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {

// Implementation of float DepthwiseConv.

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct FloatDepthwiseConvKernel {};

#ifdef USE_NEON

template <>
struct FloatDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
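    // Tail: when num_output_pixels is odd, a single pixel remains. Its 8
    // channels still span two q-registers, so the same load/multiply/store
    // pattern applies below, just at half the width.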
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    const float32x2_t filters = vld1_f32(filter_ptr);
    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 4; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
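    // With input_depth == 2, two output pixels fit exactly in one q-register,
    // so filters_dup2 (the 2-tap filter duplicated into both halves of a
    // q-register) multiplies two pixels per vmlaq_f32.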
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      const float32x4_t input = vld1q_f32(input_ptr);
      input_ptr += 4;
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filters_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float32x2_t input = vld1_f32(input_ptr);
      input_ptr += 2;
      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmla_f32(acc, input, filters);
      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters
        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
        local_filter_ptr += 16;
        // Load the inputs
        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
        local_input_ptr += 16;
        // Load the accumulators from acc_buffer
        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
        // Multiply-accumulate
        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x4_t filter;
        filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        float32x4_t input;
        input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x4_t acc;
        acc = vld1q_f32(acc_buffer_ptr);
        // Multiply-accumulate
        acc = vmlaq_f32(acc, input, filter);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr, acc);
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time.
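      // Scalar cleanup: fewer than 4 channels remain at this point, so a
      // plain C multiply-accumulate per channel is used.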
      for (; ic < input_depth; ic++) {
        const float input_val = *local_input_ptr++;
        const float filter_val = *local_filter_ptr++;
        *acc_buffer_ptr++ += filter_val * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 2 input channels at a time.
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[2];
        for (int i = 0; i < 2; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

// Note: this implementation is very slow for input_depth < 8 (e.g.
// comparable to the reference implementation); see the specializations for
// input_depth=3 below.
template <>
struct FloatDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
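      // With depth_multiplier == 2, each input value feeds two adjacent
      // outputs: vzipq_f32 below duplicates every input lane so that one
      // q-register of inputs lines up with two q-registers of filter taps.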
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        float32x4x2_t input_dup2[2];
        for (int i = 0; i < 2; i++) {
          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
          input_dup2[i] = vzipq_f32(input, input);
        }
        local_input_ptr += 8;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x2_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float32x4_t input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x2_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle 2 input channels at a time.
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        const float32x4_t filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time.
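      // Scalar tail: each remaining channel still produces two outputs
      // (depth_multiplier == 2), so two accumulators are written per input
      // value.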
      for (; ic < input_depth; ic++) {
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
        }
        local_filter_ptr += 2;
        acc_buffer_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 3, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x2_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1_f32(filter_ptr + 2 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x2_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
      }
      // Multiply-accumulate: for each input channel there are 2 outputs.
      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
      }
      acc_buffer_ptr += 6;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 3, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // NOTE: we only want 3 values, so we load them as two ops, where the
      // second op just duplicates the third value into both lanes.
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x4_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate all outputs.
      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 12;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
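    // With input_depth == 1, each output pixel consumes a single scalar
    // input, which vmlaq_n_f32 broadcasts against the 8 filter taps held in
    // two q-registers.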
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 =
        vld1q_f32(filter_ptr + 4 * 4);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      for (int ic = 0; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 4; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
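    // Strided variant of the <false, 8, 1> kernel above: consecutive pixels
    // are not contiguous in memory, so input_ptr advances by
    // input_ptr_increment per pixel and pixels are processed one at a time.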
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    float32x2_t filter = vld1_f32(filter_ptr);
    float32x4_t filter_x4 = vcombine_f32(filter, filter);
    int outp = 0;

    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      float32x2_t input_1 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x2_t input_2 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x4_t input = vcombine_f32(input_1, input_2);

      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter_x4);

      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x2_t input = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;

      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmla_f32(acc, input, filter);

      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    float32x4_t filter = vld1q_f32(filter_ptr);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input = vld1q_f32(input_ptr);
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
      input_ptr += input_ptr_increment;
    }
  }
};
#endif  // USE_NEON

// Accumulates the effect of one row of the filter, on a segment of one row
// of the output, accessing the corresponding row of the input.
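// For each filter_x position, the valid output-x range is computed and
// clamped against [out_x_buffer_start, out_x_buffer_end); the matching
// specialized kernel then accumulates that whole segment into acc_buffer in
// a single call.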
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
                                int input_depth, int input_width,
                                const float* input_data, int pad_width,
                                int depth_multiplier, int filter_width,
                                const float* filter_data,
                                int out_x_buffer_start, int out_x_buffer_end,
                                int output_depth, float* acc_buffer) {
  ruy::profiler::ScopeLabel label(__PRETTY_FUNCTION__);
  // Consistency-check the parameters. This is important in particular to
  // ensure that we keep the number of template instantiations minimal, so we
  // don't increase binary size unnecessarily.
  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
  static_assert(kFixedInputDepth || kAllowStrided, "");
  TFLITE_DCHECK(stride == 1 || kAllowStrided);
  if (kFixedInputDepth) {
    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
  }
  if (kFixedDepthMultiplier) {
    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
  }
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  const int input_ptr_increment = stride * input_depth;
  const float* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    // For the current (filter_x, filter_y) point in the filter,
    // compute the boundaries of the corresponding output row segment.
    int out_x_loop_start_unclamped = 0;
    int out_x_loop_end_unclamped = 0;
    if (kAllowStrided) {
      if (stride == 2) {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + 1) / 2;
        out_x_loop_end_unclamped =
            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
      } else if (stride == 4) {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + 3) / 4;
        out_x_loop_end_unclamped =
            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
      } else {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
        out_x_loop_end_unclamped = (pad_width + input_width -
                                    dilation_factor * filter_x + stride - 1) /
                                   stride;
      }
    } else {
      out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
      out_x_loop_end_unclamped =
          pad_width + input_width - dilation_factor * filter_x;
    }
    // The kernel will have to iterate on the segment of the output row that
    // starts at out_x_loop_start and ends at out_x_loop_end.
    const int out_x_loop_start =
        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
    const int out_x_loop_end =
        std::min(out_x_buffer_end, out_x_loop_end_unclamped);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin =
        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
                             kFixedDepthMultiplier>::Run(num_output_pixels,
                                                         input_depth,
                                                         depth_multiplier,
                                                         input_ptr,
                                                         input_ptr_increment,
                                                         filter_base_ptr,
                                                         acc_buffer_ptr);
    filter_base_ptr += output_depth;
  }
}

// Generic fallback of FloatDepthwiseConvAccumRow: portable, non-templatized.
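// Three nested scalar loops (output pixel, input channel, depth multiplier)
// handle any stride, dilation, input depth and depth multiplier. This is the
// path taken when no specialized kernel above matches, or when NEON is
// unavailable.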
inline void FloatDepthwiseConvAccumRowGeneric(
    int stride, int dilation_factor, int input_depth, int input_width,
    const float* input_data, int pad_width, int depth_multiplier,
    int filter_width, const float* filter_data, int out_x_buffer_start,
    int out_x_buffer_end, int output_depth, float* acc_buffer) {
  ruy::profiler::ScopeLabel label("DepthwiseConvAccumRowGeneric (slow)");
  const float* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    const int out_x_loop_start = std::max(
        out_x_buffer_start,
        (pad_width - dilation_factor * filter_x + stride - 1) / stride);
    const int out_x_loop_end = std::min(
        out_x_buffer_end,
        (pad_width + input_width - dilation_factor * filter_x + stride - 1) /
            stride);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin =
        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    const int input_ptr_increment = (stride - 1) * input_depth;
    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
      const float* filter_ptr = filter_base_ptr;
      for (int ic = 0; ic < input_depth; ++ic) {
        const float input_val = *input_ptr++;
        for (int m = 0; m < depth_multiplier; m++) {
          const float filter_val = *filter_ptr++;
          *acc_buffer_ptr++ += filter_val * input_val;
        }
      }
      input_ptr += input_ptr_increment;
    }
    filter_base_ptr += output_depth;
  }
}

// Initializes the accumulator buffer with bias values.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const float* bias_data,
                                       float* acc_buffer) {
  // TODO(benoitjacob): This might need optimized specializations
  // for small output_depth values, if that ever becomes an important
  // case (like it was for some quantized DepthwiseConv cases).
  for (int i = 0; i < num_output_pixels; i++) {
    memcpy(acc_buffer + i * output_depth, bias_data,
           sizeof(acc_buffer[0]) * output_depth);
  }
}

// DepthwiseConv can run multithreaded on the dim specified by thread_dim.
// Each thread processes output elements on dim thread_dim in the range
// [thread_start, thread_end).
// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, a
// thread computes DepthwiseConv for output_data[:, 2:6, :, :], i.e. rows 2
// through 5.
//
// The cpu_flags parameter is currently unused; it is included so that the
// signature matches the one required by a templated function. Other versions,
// such as the quantized one, need this parameter.
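// Overall structure: for each output row, the stack-allocated acc_buffer is
// seeded with bias values for a window of output x positions, every in-bounds
// filter row is accumulated into it via row_accum_func, and the result is
// clamped to the activation range and stored to output_data.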
inline void DepthwiseConvImpl(
    const DepthwiseParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& filter_shape,
    const float* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const CpuFlags& /* cpu_flags */, int thread_start,
    int thread_end, int thread_dim) {
  ruy::profiler::ScopeLabel label("DepthwiseConv/float/DepthwiseConvImpl");

  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  static const int kAccBufferMaxSize = 4832;
  float acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                   FIXED_DEPTH_MULTIPLIER>;               \
  }

#ifdef USE_NEON
  // We go over our list of kernels in decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride != 1.
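  // For example, with stride_width == 2, input_depth == 8 and
  // depth_multiplier == 1, the two non-strided kernels above are skipped and
  // the first match below is (true, 8, 1).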

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth; these are the
  // least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)

#endif  // USE_NEON

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
  }

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_height;
  int output_ptr_offset = 0;

  switch (thread_dim) {
    case 0:
      // Multithread along the batch axis.
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      // Multithread along the row axis.
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, output_height);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  float* output_ptr = output_data + output_ptr_offset;
  int batch_step =
      (output_height + row_start - row_end) * output_width * output_depth;

  for (int b = batch_start; b < batch_end; ++b) {
    for (int out_y = row_start; out_y < row_end; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
                          dilation_height_factor);
      const int filter_y_end =
          std::min(filter_height,
                   (input_height - in_y_origin + dilation_height_factor - 1) /
                       dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
                                   acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
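        // filter_y_start and filter_y_end were clamped above, so every in_y
        // visited here lies within [0, input_height) and vertical padding
        // needs no test inside the loop. Horizontal padding is handled by the
        // x-range clamping inside row_accum_func.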
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(
              stride_width, dilation_width_factor, input_depth, input_width,
              input_data + in_y * input_height_stride + b * input_batch_stride,
              pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_height_stride,
              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating. Now store to destination.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
        // TODO(benoitjacob) optimized code goes here
#ifdef USE_NEON
        // Handle 16 values at a time
        for (; i <= num_output_values - 16; i += 16) {
          float32x4_t acc[4];
          for (int k = 0; k < 4; k++) {
            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
          }
          for (int k = 0; k < 4; k++) {
            acc[k] = vmaxq_f32(
                vdupq_n_f32(output_activation_min),
                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
          }
          for (int k = 0; k < 4; k++) {
            vst1q_f32(output_ptr + 4 * k, acc[k]);
          }
          output_ptr += 16;
        }
        // Handle 4 values at a time
        for (; i <= num_output_values - 4; i += 4) {
          float32x4_t acc = vld1q_f32(acc_buffer + i);

          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
                          vminq_f32(vdupq_n_f32(output_activation_max), acc));

          vst1q_f32(output_ptr, acc);
          output_ptr += 4;
        }
#endif  // USE_NEON
        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++) {
          float acc = acc_buffer[i];
          acc = std::max(output_activation_min,
                         std::min(output_activation_max, acc));
          *output_ptr++ = acc;
        }
      }
    }
    output_ptr += batch_step;
  }
}

}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_