// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert BATCH_TILE % 4 == 0
$assert BATCH_TILE >= 4
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/common.h>
#include <xnnpack/hswish.h>


void xnn_f32_hswish_ukernel__${"neonfma" if FMA else "neon"}_x${BATCH_TILE}(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_hswish_params params[restrict static 1])
{
  assert(n != 0);
  assert(n % sizeof(float) == 0);

  // hswish(x) = x * min(max(x / 6 + 1/2, 0), 1)
  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
  const float32x4_t vzero = vdupq_n_f32(0.0f);

  // Main loop: process BATCH_TILE elements per iteration, unrolled in groups of 4 lanes.
  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
    $for N in range(0, BATCH_TILE, 4):
      const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;

    $for N in range(0, BATCH_TILE, 4):
      $if FMA:
        float32x4_t vacc${ABC[N:N+4]} = vfmaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
      $else:
        float32x4_t vacc${ABC[N:N+4]} = vmlaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);

    $for N in range(0, BATCH_TILE, 4):
      vacc${ABC[N:N+4]} = vmaxq_f32(vacc${ABC[N:N+4]}, vzero);

    $for N in range(0, BATCH_TILE, 4):
      vacc${ABC[N:N+4]} = vminq_f32(vacc${ABC[N:N+4]}, vone);

    $for N in range(0, BATCH_TILE, 4):
      vacc${ABC[N:N+4]} = vmulq_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});

    $for N in range(0, BATCH_TILE, 4):
      vst1q_f32(y, vacc${ABC[N:N+4]}); y += 4;
  }
  $if BATCH_TILE > 4:
    // Remainder loop: process 4 elements per iteration (only generated when BATCH_TILE > 4,
    // since for BATCH_TILE == 4 the main loop already covers this case).
    for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
      const float32x4_t vx0123 = vld1q_f32(x); x += 4;
      $if FMA:
        float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
      $else:
        float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
      vacc0123 = vmaxq_f32(vacc0123, vzero);
      vacc0123 = vminq_f32(vacc0123, vone);
      vacc0123 = vmulq_f32(vacc0123, vx0123);
      vst1q_f32(y, vacc0123); y += 4;
    }
  // Final 1-3 elements: compute a full vector, then store 2 and/or 1 lanes.
  if XNN_UNLIKELY(n != 0) {
    const float32x4_t vx0123 = vld1q_f32(x);
    $if FMA:
      float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
    $else:
      float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
    vacc0123 = vmaxq_f32(vacc0123, vzero);
    vacc0123 = vminq_f32(vacc0123, vone);
    vacc0123 = vmulq_f32(vacc0123, vx0123);

    float32x2_t vacc01 = vget_low_f32(vacc0123);
    if (n & (2 * sizeof(float))) {
      vst1_f32(y, vacc01); y += 2;
      vacc01 = vget_high_f32(vacc0123);
    }
    if (n & (1 * sizeof(float))) {
      vst1_lane_f32(y, vacc01, 0);
    }
  }
}
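
// Editorial sketch, not generator output verbatim: assuming FMA=1 and
// BATCH_TILE=8, the ${ABC[N:N+4]} slices expand to "0123" and "4567", and the
// main loop above is expected to unroll to roughly the following.
//
//   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
//     const float32x4_t vx0123 = vld1q_f32(x); x += 4;
//     const float32x4_t vx4567 = vld1q_f32(x); x += 4;
//
//     float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
//     float32x4_t vacc4567 = vfmaq_f32(vhalf, vx4567, vsixth);
//
//     vacc0123 = vmaxq_f32(vacc0123, vzero);
//     vacc4567 = vmaxq_f32(vacc4567, vzero);
//
//     vacc0123 = vminq_f32(vacc0123, vone);
//     vacc4567 = vminq_f32(vacc4567, vone);
//
//     vacc0123 = vmulq_f32(vacc0123, vx0123);
//     vacc4567 = vmulq_f32(vacc4567, vx4567);
//
//     vst1q_f32(y, vacc0123); y += 4;
//     vst1q_f32(y, vacc4567); y += 4;
//   }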