// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert BATCH_TILE % 4 == 0
$assert BATCH_TILE >= 4
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

$if BLEND:
  #include <smmintrin.h>
$else:
  #include <emmintrin.h>

#include <xnnpack/common.h>
#include <xnnpack/vunary.h>


// Template for an f32 sigmoid microkernel using a degree-5 polynomial
// approximation of exp() and division-based reconstruction.
// Template parameters:
//   BATCH_TILE - number of elements processed per main-loop iteration;
//                must be a multiple of 4 and at least 4 (see $asserts above).
//   BLEND      - if true, generate the SSE4.1 variant (_mm_blendv_ps for the
//                sign-based selection); otherwise generate the SSE2 variant
//                (mask-and-or selection).
//
// Arguments:
//   n      - number of bytes of input (must be a multiple of sizeof(float))
//   x      - input array
//   y      - output array; y[i] = sigmoid(x[i])
//   params - unused in this kernel
void xnn_f32_sigmoid_ukernel__${"sse41" if BLEND else "sse2"}_p5_div_x${BATCH_TILE}(
    size_t n,
    const float* x,
    float* y,
    const void* params)
{
  assert(n % sizeof(float) == 0);

  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
  // The smallest x for which sigmoidf(x) is normalized.
  // This number is also the smallest x for which expf(x) is normalized.
  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
  // Last 7 bits are zeroes
  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
  const __m128 vone = _mm_set1_ps(1.0f);
  const __m128 vsign_mask = _mm_set1_ps(-0.0f);

  // Coefficients of the degree-5 polynomial approximation of exp(t)
  // on [-log(2)/2, log(2)/2].
  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);

  $if BATCH_TILE > 4:
    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
      const __m128 vx${ABC[0:4]} = _mm_loadu_ps(x);
      $for N in range(4, BATCH_TILE, 4):
        const __m128 vx${ABC[N:N+4]} = _mm_loadu_ps(x + ${N});

      // General structure of the algorithm:
      //           / exp(x) / (1 + exp(x)) if x <= 0
      //   f[x] :=
      //           \ 1 - f[-x] if x >= 0
      //
      // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
      // then replace result with 1 - f[z] if x >= 0.
      $for N in range(0, BATCH_TILE, 4):
        const __m128 vz${ABC[N:N+4]} = _mm_or_ps(vx${ABC[N:N+4]}, vsign_mask);

      // Compute reduced argument n := round(z / log(2)).
      // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the
      // result to an integer, then subtracting the large number back. The trick with adding large number is valid only
      // within certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678]
      // (i.e. z outside [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs
      // at the very end of the algorithm.
      $for N in range(0, BATCH_TILE, 4):
        __m128 vn${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vz${ABC[N:N+4]}, vlog2e), vmagic_bias);

      // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
      // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
      $for N in range(0, BATCH_TILE, 4):
        const __m128 vs${ABC[N:N+4]} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${ABC[N:N+4]}), 23));

      // Subtract the large number back to get final n := round(z / log(2)).
      $for N in range(0, BATCH_TILE, 4):
        vn${ABC[N:N+4]} = _mm_sub_ps(vn${ABC[N:N+4]}, vmagic_bias);

      // Compute reduced argument t := z - n * log(2).
      // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
      $for N in range(0, BATCH_TILE, 4):
        __m128 vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_hi), vz${ABC[N:N+4]});

      $for N in range(0, BATCH_TILE, 4):
        vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_lo), vt${ABC[N:N+4]});

      // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
      $for N in range(0, BATCH_TILE, 4):
        __m128 vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vc5, vt${ABC[N:N+4]}), vc4);

      $for N in range(0, BATCH_TILE, 4):
        vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc3);

      $for N in range(0, BATCH_TILE, 4):
        vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc2);

      $for N in range(0, BATCH_TILE, 4):
        vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc1);

      // Reconstruct the exp(z) value:
      //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
      //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
      //     = s + (t * s) * p
      $for N in range(0, BATCH_TILE, 4):
        vt${ABC[N:N+4]} = _mm_mul_ps(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});

      $for N in range(0, BATCH_TILE, 4):
        __m128 ve${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vt${ABC[N:N+4]}, vp${ABC[N:N+4]}), vs${ABC[N:N+4]});

      // Denominator of the sigmoid fraction: 1.0 + exp(z)
      $for N in range(0, BATCH_TILE, 4):
        __m128 vd${ABC[N:N+4]} = _mm_add_ps(ve${ABC[N:N+4]}, vone);

      // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
      $for N in range(0, BATCH_TILE, 4):
        __m128 vf${ABC[N:N+4]} = _mm_div_ps(ve${ABC[N:N+4]}, vd${ABC[N:N+4]});

      // For inputs below denormal cutoff, replace output with +0.0f.
      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
      $for N in range(0, BATCH_TILE, 4):
        vf${ABC[N:N+4]} = _mm_andnot_ps(_mm_cmplt_ps(vz${ABC[N:N+4]}, vdenorm_cutoff), vf${ABC[N:N+4]});

      // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
      // (SSE4.1: blendv on the sign bit of x; SSE2: build a full-lane mask
      // from the sign bit and combine with and/andnot/or.)
      $if BLEND:
        $for N in range(0, BATCH_TILE, 4):
          vf${ABC[N:N+4]} = _mm_blendv_ps(_mm_sub_ps(vone, vf${ABC[N:N+4]}), vf${ABC[N:N+4]}, vx${ABC[N:N+4]});
      $else:
        $for N in range(0, BATCH_TILE, 4):
          __m128 vm${ABC[N:N+4]} = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx${ABC[N:N+4]})));

        $for N in range(0, BATCH_TILE, 4):
          vf${ABC[N:N+4]} = _mm_or_ps(_mm_and_ps(vf${ABC[N:N+4]}, vm${ABC[N:N+4]}), _mm_andnot_ps(vm${ABC[N:N+4]}, _mm_sub_ps(vone, vf${ABC[N:N+4]})));

      _mm_storeu_ps(y, vf${ABC[0:4]});
      $for N in range(4, BATCH_TILE, 4):
        _mm_storeu_ps(y + ${N}, vf${ABC[N:N+4]});

      x += ${BATCH_TILE};
      y += ${BATCH_TILE};
    }
  // Secondary loop: process one 4-element vector at a time.
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    const __m128 vx = _mm_loadu_ps(x);

    // General structure of the algorithm:
    //           / exp(x) / (1 + exp(x)) if x <= 0
    //   f[x] :=
    //           \ 1 - f[-x] if x >= 0
    //
    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
    // then replace result with 1 - f[z] if x >= 0.
    const __m128 vz = _mm_or_ps(vx, vsign_mask);

    // Compute reduced argument n := round(z / log(2)).
    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the
    // result to an integer, then subtracting the large number back. The trick with adding large number is valid only
    // within certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678]
    // (i.e. z outside [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs
    // at the very end of the algorithm.
    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));

    // Subtract the large number back to get final n := round(z / log(2)).
    vn = _mm_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := z - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);

    // Reconstruct the exp(z) value:
    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm_mul_ps(vt, vs);
    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);

    // Denominator of the sigmoid fraction: 1.0 + exp(z)
    __m128 vd = _mm_add_ps(ve, vone);

    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
    __m128 vf = _mm_div_ps(ve, vd);

    // For inputs below denormal cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);

    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
    $if BLEND:
      vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
    $else:
      __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
      vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));

    _mm_storeu_ps(y, vf);

    x += 4;
    y += 4;
  }
  // Remainder: 1-3 elements left (n is a multiple of sizeof(float) and
  // less than 4 * sizeof(float) here).
  if XNN_UNLIKELY(n != 0) {
    // NOTE(review): this loads a full 4-float vector even though fewer than
    // 4 elements remain -- presumably callers guarantee the tail of x is
    // readable; confirm against the kernel's buffer-padding contract.
    const __m128 vx = _mm_loadu_ps(x);

    // General structure of the algorithm:
    //           / exp(x) / (1 + exp(x)) if x <= 0
    //   f[x] :=
    //           \ 1 - f[-x] if x >= 0
    //
    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
    // then replace result with 1 - f[z] if x >= 0.
    const __m128 vz = _mm_or_ps(vx, vsign_mask);

    // Compute reduced argument n := round(z / log(2)).
    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the
    // result to an integer, then subtracting the large number back. The trick with adding large number is valid only
    // within certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678]
    // (i.e. z outside [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs
    // at the very end of the algorithm.
    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);

    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));

    // Subtract the large number back to get final n := round(z / log(2)).
    vn = _mm_sub_ps(vn, vmagic_bias);

    // Compute reduced argument t := z - n * log(2).
    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);

    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);

    // Reconstruct the exp(z) value:
    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
    //     = s + (t * s) * p
    vt = _mm_mul_ps(vt, vs);
    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);

    // Denominator of the sigmoid fraction: 1.0 + exp(z)
    __m128 vd = _mm_add_ps(ve, vone);

    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
    __m128 vf = _mm_div_ps(ve, vd);

    // For inputs below denormal cutoff, replace output with +0.0f.
    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);

    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
    $if BLEND:
      vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
    $else:
      __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
      vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));

    // Store the valid 1-3 lanes: low pair first (if bit 2 of n is set),
    // then shift the high pair down and store a single lane (if bit 1 is set).
    if (n & (2 * sizeof(float))) {
      _mm_storel_pi((__m64*) y, vf);
      vf = _mm_movehl_ps(vf, vf);
      y += 2;
    }
    if (n & (1 * sizeof(float))) {
      _mm_store_ss(y, vf);
    }
  }
}