1// Copyright 2019 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert CHANNEL_TILE % 4 == 0 7$assert KERNEL_TILE >= 2 8$assert ACCUMULATORS >= 1 9$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 10$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32" 11#include <assert.h> 12 13#include <arm_neon.h> 14 15#include <xnnpack/dwconv.h> 16 17 18void xnn_f32_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${"neonfma" if FMA else "neon"}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( 19 size_t channels, 20 size_t output_width, 21 const float** input, 22 const float* weights, 23 float* output, 24 size_t input_stride, 25 size_t output_increment, 26 size_t input_offset, 27 const float* zero, 28 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 29{ 30 assert(channels != 0); 31 assert(output_width != 0); 32 33 const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); 34 const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); 35 do { 36 $for K in range(KERNEL_TILE): 37 const float* i${K} = input[${K}]; 38 assert(i${K} != NULL); 39 if XNN_UNPREDICTABLE(i${K} != zero) { 40 i${K} = (const float*) ((uintptr_t) i${K} + input_offset); 41 } 42 43 input = (const float**) ((uintptr_t) input + input_stride); 44 45 size_t c = channels; 46 const float* w = weights; 47 for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) { 48 $for C in range(0, CHANNEL_TILE, 4): 49 float32x4_t vacc${ABC[C:C+4]}p0 = vld1q_f32(w); w += 4; 50 51 $for K in range(KERNEL_TILE): 52 53 $for C in range(0, CHANNEL_TILE, 4): 54 const float32x4_t vi${K}x${ABC[C:C+4]} = vld1q_f32(i${K}); i${K} += 4; 55 $for C in range(0, CHANNEL_TILE, 4): 56 const float32x4_t vk${K}x${ABC[C:C+4]} = vld1q_f32(w); w += 4; 57 $for C in range(0, CHANNEL_TILE, 4): 58 $if 1 <= K < ACCUMULATORS: 59 float32x4_t vacc${ABC[C:C+4]}p${K} = vmulq_f32(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]}); 60 $else: 61 vacc${ABC[C:C+4]}p${K % ACCUMULATORS} = ${VMULADDQ_F32}(vacc${ABC[C:C+4]}p${K % ACCUMULATORS}, vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]}); 62 63 $if ACCUMULATORS > 1: 64 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0 65 $ACC_STEP = 1 66 $while ACC_STEP < ACCUMULATORS: 67 $for A in range(0, ACCUMULATORS, ACC_STEP * 2): 68 $if A + ACC_STEP < ACCUMULATORS: 69 $for C in range(0, CHANNEL_TILE, 4): 70 vacc${ABC[C:C+4]}p${A} = vaddq_f32(vacc${ABC[C:C+4]}p${A}, vacc${ABC[C:C+4]}p${A + ACC_STEP}); 71 $ACC_STEP *= 2 72 73 $for C in range(0, CHANNEL_TILE, 4): 74 float32x4_t vacc${ABC[C:C+4]} = vmaxq_f32(vacc${ABC[C:C+4]}p0, vmin); 75 $for C in range(0, CHANNEL_TILE, 4): 76 vacc${ABC[C:C+4]} = vminq_f32(vacc${ABC[C:C+4]}, vmax); 77 78 $for C in range(0, CHANNEL_TILE, 4): 79 vst1q_f32(output, vacc${ABC[C:C+4]}); output += 4; 80 } 81 $if CHANNEL_TILE > 4: 82 for (; c >= 4; c -= 4) { 83 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; 84 85 $for K in range(KERNEL_TILE): 86 87 const float32x4_t vi${K}x0123 = vld1q_f32(i${K}); i${K} += 4; 88 const float32x4_t vk${K}x0123 = vld1q_f32(w + ${(K + 1) * CHANNEL_TILE - 4}); 89 $if 1 <= K < ACCUMULATORS: 90 float32x4_t vacc0123p${K} = vmulq_f32(vi${K}x0123, vk${K}x0123); 91 $else: 92 vacc0123p${K % ACCUMULATORS} = ${VMULADDQ_F32}(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123); 93 94 $if ACCUMULATORS > 1: 95 // Add up all accumulators to vacc0123p0 96 $ACC_STEP = 1 97 $while ACC_STEP < ACCUMULATORS: 98 $for A in range(0, ACCUMULATORS, ACC_STEP * 2): 99 $if A + ACC_STEP < ACCUMULATORS: 100 vacc0123p${A} = vaddq_f32(vacc0123p${A}, vacc0123p${A + ACC_STEP}); 101 $ACC_STEP *= 2 102 103 float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); 104 vacc0123 = vminq_f32(vacc0123, vmax); 105 106 vst1q_f32(output, vacc0123); output += 4; 107 } 108 if XNN_UNLIKELY(c != 0) { 109 $if CHANNEL_TILE == 4: 110 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; 111 $else: 112 float32x4_t vacc0123p0 = vld1q_f32(w); 113 114 $for K in range(KERNEL_TILE): 115 116 const float32x4_t vi${K}x0123 = vld1q_f32(i${K}); 117 $if CHANNEL_TILE == 4: 118 const float32x4_t vk${K}x0123 = vld1q_f32(w); w += 4; 119 $else: 120 const float32x4_t vk${K}x0123 = vld1q_f32(w + ${(K + 1) * CHANNEL_TILE}); 121 $if 1 <= K < ACCUMULATORS: 122 float32x4_t vacc0123p${K} = vmulq_f32(vi${K}x0123, vk${K}x0123); 123 $else: 124 vacc0123p${K % ACCUMULATORS} = ${VMULADDQ_F32}(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123); 125 126 $if ACCUMULATORS > 1: 127 // Add up all accumulators to vacc0123p0 128 $ACC_STEP = 1 129 $while ACC_STEP < ACCUMULATORS: 130 $for A in range(0, ACCUMULATORS, ACC_STEP * 2): 131 $if A + ACC_STEP < ACCUMULATORS: 132 vacc0123p${A} = vaddq_f32(vacc0123p${A}, vacc0123p${A + ACC_STEP}); 133 $ACC_STEP *= 2 134 135 float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); 136 vacc0123 = vminq_f32(vacc0123, vmax); 137 138 float32x2_t vacc01 = vget_low_f32(vacc0123); 139 if (c & 2) { 140 vst1_f32(output, vacc01); output += 2; 141 vacc01 = vget_high_f32(vacc0123); 142 } 143 if (c & 1) { 144 vst1_lane_f32(output, vacc01, 0); output += 1; 145 } 146 } 147 148 output = (float*) ((uintptr_t) output + output_increment); 149 } while (--output_width != 0); 150} 151