// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert BATCH_TILE % 4 == 0
$assert BATCH_TILE >= 4
$assert RR_STEPS in [1, 2]
$assert DIV_ALGO in ["div", "nr2fma", "nr2recps", "nr1recps1fma"]
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
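// Template parameters (fixed when this microkernel is generated):
//   BATCH_TILE - number of elements processed per iteration of the main loop; a multiple of 4.
//   RR_STEPS   - range-reduction steps: 1 uses a single log(2)/2048 constant, 2 uses a
//                Cody-Waite hi/lo pair of constants.
//   DIV_ALGO   - how exp(-z) / (1 + exp(-z)) is evaluated: "div" divides directly, while the
//                "nr*" variants refine a vrecpeq_f32 reciprocal estimate with two
//                Newton-Raphson steps built from vrecpsq_f32 and/or FMA instructions.
//   FMA        - when set, multiply-adds use vfmaq_f32 (NEON-FMA); otherwise vmlaq_f32.
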
#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/common.h>
#include <xnnpack/vunary.h>


extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];

void xnn_f32_sigmoid_ukernel__${"neonfma" if FMA else "neon"}_rr${RR_STEPS}_lut2048_p1_${DIV_ALGO}_x${BATCH_TILE}(
    size_t n,
    const float* x,
    float* y,
    const void* params)
{
  assert(n % sizeof(float) == 0);

  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
  // The largest z for which sigmoidf(-z) is normalized.
  // This number is also the largest z for which expf(-z) is normalized.
  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
  const float32x4_t vminus_log2e_x2048 = vmovq_n_f32(-0x1.715476p11f);
  $if RR_STEPS == 1:
    const float32x4_t vln2_o2048 = vmovq_n_f32(0x1.62E43p-12f);
  $else:
    $if FMA:
      const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
      const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
    $else:
      // Last 18 bits are zeroes
      const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.600000p-12f);
      const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7217F8p-19f);
  const float32x4_t vone = vmovq_n_f32(1.0f);

  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);

  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));

  $if BATCH_TILE > 4:
    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
      $for N in range(0, BATCH_TILE, 4):
        const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;

      // General structure of the algorithm:
      //           / exp(x) / (1 + exp(x)) if x <= 0
      //   f[x] :=
      //           \ 1 - f[-x] if x >= 0
      //
      // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
      // then replace the result with 1 - f[-z] if x >= 0.
      $for N in range(0, BATCH_TILE, 4):
        const float32x4_t vz${ABC[N:N+4]} = vabsq_f32(vx${ABC[N:N+4]});

      // Compute reduced argument n := round(-z * 2048 / log(2)).
      // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
      // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
      // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
      // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
      // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
      // for such inputs at the very end of the algorithm.
      $for N in range(0, BATCH_TILE, 4):
        float32x4_t vn${ABC[N:N+4]} = ${VMULADDQ_F32}(vmagic_bias, vz${ABC[N:N+4]}, vminus_log2e_x2048);

      // Create a floating-point number s (scale) such that s := 2**(n / 2048) for inputs for which sigmoidf(-z) is
      // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
      // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
      // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note that the
      //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
      // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
      //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
      //    and thus the adjusted exponent is not lower than -126.
      //
      // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
      $for N in range(0, BATCH_TILE, 4):
        const int32x4_t ve${ABC[N:N+4]} = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn${ABC[N:N+4]}), vmovq_n_s32(INT32_C(0x7FF))), 12);

      // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
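      // NEON has no gather load, so the four indices are extracted as two 64-bit lanes (each
      // holding a pair of 32-bit indices) and the table entries are fetched with scalar
      // vld1_dup_f32 / vld1_lane_f32 loads, then recombined into 128-bit vectors.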
      $for N in range(0, BATCH_TILE, 4):
        const uint64x2_t vidx${ABC[N:N+4]} = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn${ABC[N:N+4]}), vindex_mask));

      $for N in range(0, BATCH_TILE, 4):
        const uint64_t vidx${ABC[N:N+2]} = vgetq_lane_u64(vidx${ABC[N:N+4]}, 0);
        const uint64_t vidx${ABC[N+2:N+4]} = vgetq_lane_u64(vidx${ABC[N:N+4]}, 1);
        float32x2_t vl${ABC[N:N+2]} = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx${ABC[N:N+2]}]);
        float32x2_t vl${ABC[N+2:N+4]} = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx${ABC[N+2:N+4]}]);

      $for N in range(0, BATCH_TILE, 4):
        vl${ABC[N:N+2]} = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx${ABC[N:N+2]} >> 32)], vl${ABC[N:N+2]}, 1);
        vl${ABC[N+2:N+4]} = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx${ABC[N+2:N+4]} >> 32)], vl${ABC[N+2:N+4]}, 1);
        const float32x4_t vl${ABC[N:N+4]} = vcombine_f32(vl${ABC[N:N+2]}, vl${ABC[N+2:N+4]});

      // Adjust exponent of the value l fetched from the table to get the final s value.
      $for N in range(0, BATCH_TILE, 4):
        const float32x4_t vs${ABC[N:N+4]} = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl${ABC[N:N+4]}), ve${ABC[N:N+4]}));

      // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
      $for N in range(0, BATCH_TILE, 4):
        vn${ABC[N:N+4]} = vsubq_f32(vn${ABC[N:N+4]}, vmagic_bias);

      // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
      $if RR_STEPS == 1:
        $for N in range(0, BATCH_TILE, 4):
          float32x4_t vt${ABC[N:N+4]} = ${VMULADDQ_F32}(vz${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_o2048);
      $else:
        // Use the Cody-Waite range reduction method (note the two constants used to represent log(2) / 2048) to improve accuracy.
        $for N in range(0, BATCH_TILE, 4):
          float32x4_t vt${ABC[N:N+4]} = ${VMULADDQ_F32}(vz${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_o2048_hi);

        $for N in range(0, BATCH_TILE, 4):
          vt${ABC[N:N+4]} = ${VMULADDQ_F32}(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_o2048_lo);

      // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
      //   P1(t) = 1 + t * c1
      $for N in range(0, BATCH_TILE, 4):
        const float32x4_t vp${ABC[N:N+4]} = vmulq_f32(vt${ABC[N:N+4]}, vc1);

      // Reconstruct the exp(-z) value:
      //   y = s * (1 + t * c1)
      //     = s + s * (t * c1)
      //     = s + s * p
      $for N in range(0, BATCH_TILE, 4):
        const float32x4_t vy${ABC[N:N+4]} = ${VMULADDQ_F32}(vs${ABC[N:N+4]}, vs${ABC[N:N+4]}, vp${ABC[N:N+4]});

      // Denominator of the sigmoid fraction: 1.0 + exp(-z)
      $for N in range(0, BATCH_TILE, 4):
        const float32x4_t vd${ABC[N:N+4]} = vaddq_f32(vy${ABC[N:N+4]}, vone);

      $if DIV_ALGO == "div":
        // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
        $for N in range(0, BATCH_TILE, 4):
          float32x4_t vf${ABC[N:N+4]} = vdivq_f32(vy${ABC[N:N+4]}, vd${ABC[N:N+4]});
      $else:
        // Use the Newton-Raphson method (2 iterations) to compute the reciprocal of the denominator.
        // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
        // Thus the reciprocal of the denominator never overflows.
        $for N in range(0, BATCH_TILE, 4):
          float32x4_t vr${ABC[N:N+4]} = vrecpeq_f32(vd${ABC[N:N+4]});

        $if DIV_ALGO == "nr2fma":
          $for N in range(0, BATCH_TILE, 4):
            vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
        $else:
          $for N in range(0, BATCH_TILE, 4):
            vr${ABC[N:N+4]} = vmulq_f32(vr${ABC[N:N+4]}, vrecpsq_f32(vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));

        $if DIV_ALGO == "nr2recps":
          $for N in range(0, BATCH_TILE, 4):
            vr${ABC[N:N+4]} = vmulq_f32(vr${ABC[N:N+4]}, vrecpsq_f32(vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
        $else:
          $for N in range(0, BATCH_TILE, 4):
            vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));

        // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
        $for N in range(0, BATCH_TILE, 4):
          float32x4_t vf${ABC[N:N+4]} = vmulq_f32(vy${ABC[N:N+4]}, vr${ABC[N:N+4]});

      // For inputs below the denormal cutoff, replace output with +0.0f.
      // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
      $for N in range(0, BATCH_TILE, 4):
        vf${ABC[N:N+4]} = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf${ABC[N:N+4]}), vcagtq_f32(vx${ABC[N:N+4]}, vdenorm_cutoff)));

      // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
      $for N in range(0, BATCH_TILE, 4):
        const uint32x4_t vm${ABC[N:N+4]} = vcltq_f32(vx${ABC[N:N+4]}, vmovq_n_f32(0.0f));

      $for N in range(0, BATCH_TILE, 4):
        vf${ABC[N:N+4]} = vbslq_f32(vm${ABC[N:N+4]}, vf${ABC[N:N+4]}, vsubq_f32(vone, vf${ABC[N:N+4]}));

      $for N in range(0, BATCH_TILE, 4):
        vst1q_f32(y, vf${ABC[N:N+4]}); y += 4;
    }
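  // Process remaining full groups of four elements with the same algorithm
  // (when BATCH_TILE == 4 this is the main loop).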
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const float32x4_t vx = vld1q_f32(x); x += 4;

    // General structure of the algorithm:
    //           / exp(x) / (1 + exp(x)) if x <= 0
    //   f[x] :=
    //           \ 1 - f[-x] if x >= 0
    //
    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
    // then replace the result with 1 - f[-z] if x >= 0.
    const float32x4_t vz = vabsq_f32(vx);

    // Compute reduced argument n := round(-z * 2048 / log(2)).
    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
    // for such inputs at the very end of the algorithm.
    float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vz, vminus_log2e_x2048);

    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for inputs for which sigmoidf(-z) is
    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note that the
    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
    //    and thus the adjusted exponent is not lower than -126.
    //
    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);

    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
    // Adjust exponent of the value l fetched from the table to get the final s value.
    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));

    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
    vn = vsubq_f32(vn, vmagic_bias);

    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
    $if RR_STEPS == 1:
      float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_o2048);
    $else:
      // Use the Cody-Waite range reduction method (note the two constants used to represent log(2) / 2048) to improve accuracy.
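      // The hi and lo constants together represent log(2) / 2048 with more than single precision;
      // in the non-FMA variant the hi constant also has trailing zero mantissa bits, so that the
      // product n * hi can be computed exactly without FMA.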
      float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_o2048_hi);
      vt = ${VMULADDQ_F32}(vt, vn, vln2_o2048_lo);

    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
    //   P1(t) = 1 + t * c1
    const float32x4_t vp = vmulq_f32(vt, vc1);

    // Reconstruct the exp(-z) value:
    //   y = s * (1 + t * c1)
    //     = s + s * (t * c1)
    //     = s + s * p
    const float32x4_t vy = ${VMULADDQ_F32}(vs, vs, vp);

    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
    const float32x4_t vd = vaddq_f32(vy, vone);

    $if DIV_ALGO == "div":
      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
      float32x4_t vf = vdivq_f32(vy, vd);
    $else:
      // Use the Newton-Raphson method (2 iterations) to compute the reciprocal of the denominator.
      // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
      // Thus the reciprocal of the denominator never overflows.
      float32x4_t vr = vrecpeq_f32(vd);

      $if DIV_ALGO == "nr2fma":
        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
      $else:
        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));

      $if DIV_ALGO == "nr2recps":
        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
      $else:
        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));

      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
      float32x4_t vf = vmulq_f32(vy, vr);

    // For inputs below the denormal cutoff, replace output with +0.0f.
    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));

    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));

    vst1q_f32(y, vf); y += 4;
  }
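  // If 1-3 elements remain, compute one more full vector of results with the same algorithm;
  // only the valid lanes are stored at the end of the block.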
  if XNN_UNLIKELY(n != 0) {
    const float32x4_t vx = vld1q_f32(x);

    // General structure of the algorithm:
    //           / exp(x) / (1 + exp(x)) if x <= 0
    //   f[x] :=
    //           \ 1 - f[-x] if x >= 0
    //
    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
    // then replace the result with 1 - f[-z] if x >= 0.
    const float32x4_t vz = vabsq_f32(vx);

    // Compute reduced argument n := round(-z * 2048 / log(2)).
    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
    // for such inputs at the very end of the algorithm.
    float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vz, vminus_log2e_x2048);

    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for inputs for which sigmoidf(-z) is
    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note that the
    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
    //    and thus the adjusted exponent is not lower than -126.
    //
    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);

    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
    // Adjust exponent of the value l fetched from the table to get the final s value.
    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));

    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
    vn = vsubq_f32(vn, vmagic_bias);

    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
    $if RR_STEPS == 1:
      float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_o2048);
    $else:
      // Use the Cody-Waite range reduction method (note the two constants used to represent log(2) / 2048) to improve accuracy.
      float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_o2048_hi);
      vt = ${VMULADDQ_F32}(vt, vn, vln2_o2048_lo);

    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
    //   P1(t) = 1 + t * c1
    const float32x4_t vp = vmulq_f32(vt, vc1);

    // Reconstruct the exp(-z) value:
    //   y = s * (1 + t * c1)
    //     = s + s * (t * c1)
    //     = s + s * p
    const float32x4_t vy = ${VMULADDQ_F32}(vs, vs, vp);

    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
    const float32x4_t vd = vaddq_f32(vy, vone);

    $if DIV_ALGO == "div":
      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
      float32x4_t vf = vdivq_f32(vy, vd);
    $else:
      // Use the Newton-Raphson method (2 iterations) to compute the reciprocal of the denominator.
      // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
      // Thus the reciprocal of the denominator never overflows.
      float32x4_t vr = vrecpeq_f32(vd);

      $if DIV_ALGO == "nr2fma":
        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
      $else:
        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));

      $if DIV_ALGO == "nr2recps":
        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
      $else:
        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));

      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
      float32x4_t vf = vmulq_f32(vy, vr);

    // For inputs below the denormal cutoff, replace output with +0.0f.
    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));

    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
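
    // Store the remaining 1-3 results: write the lower two lanes when at least two elements
    // are left, then write a single lane when the remaining count is odd.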
    float32x2_t vf_lo = vget_low_f32(vf);
    if (n & (2 * sizeof(float))) {
      vst1_f32(y, vf_lo); y += 2;
      vf_lo = vget_high_f32(vf);
    }
    if (n & (1 * sizeof(float))) {
      vst1_lane_f32(y, vf_lo, 0);
    }
  }
}