1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert ELEMENTS_TILE >= 1 7$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 8#include <assert.h> 9 10#include <xnnpack/common.h> 11#include <xnnpack/raddstoreexpminusmax.h> 12 13#include <fp16/bitcasts.h> 14 15 16void xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( 17 size_t elements, 18 const float* input, 19 float* output, 20 float* sum, 21 float vi_max) 22{ 23 assert(elements % sizeof(float) == 0); 24 25 const float vmagic_bias = 0x1.8000FEp23f; 26 // The smallest x for which expf(x) is normalized. 27 const float vdenorm_cutoff = -0x1.5D589Ep6f; 28 const float vlog2e = 0x1.715476p+0f; 29 // Last 7 bits are zeroes 30 const float vminus_ln2_hi = -0x1.62E400p-1f; 31 const float vminus_ln2_lo = -0x1.7F7D1Cp-20f; 32 33 const float vc1 = 0x1.FFFFF6p-1f; 34 const float vc2 = 0x1.FFFDC6p-2f; 35 const float vc3 = 0x1.555A80p-3f; 36 const float vc4 = 0x1.573A1Ap-5f; 37 const float vc5 = 0x1.0F9F9Cp-7f; 38 39 $if ELEMENTS_TILE > 1: 40 $for K in range(ACCUMULATORS): 41 float vacc${K} = 0.0f; 42 for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) { 43 // Load ${ELEMENTS_TILE} inputs at a time. 44 $for N in range(ELEMENTS_TILE): 45 const float vi${N} = input[${N}]; 46 input += ${ELEMENTS_TILE}; 47 48 // Subtract maximum input x := i - i_max. This implies x <= 0. 49 $for N in range(ELEMENTS_TILE): 50 const float vx${N} = vi${N} - vi_max; 51 52 // Compute reduced argument n := round(x / log(2)). 53 // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result 54 // to an integer, then subtracing the large number back. The trick with adding large number is valid only within 55 // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x) 56 // anyway. We fixup the result for such inputs at the very end of the algorithm. 57 $for N in range(ELEMENTS_TILE): 58 float vn${N} = vx${N} * vlog2e + vmagic_bias; 59 60 // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e. 61 // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly. 62 $for N in range(ELEMENTS_TILE): 63 const float vs${N} = fp32_from_bits(fp32_to_bits(vn${N}) << 23); 64 65 // Subtract the large number back to get final n := round(x / log(2)). 66 $for N in range(ELEMENTS_TILE): 67 vn${N} -= vmagic_bias; 68 69 // Compute reduced argument t := x - n * log(2). 70 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 71 $for N in range(ELEMENTS_TILE): 72 float vt${N} = vn${N} * vminus_ln2_hi + vx${N}; 73 74 $for N in range(ELEMENTS_TILE): 75 vt${N} = vn${N} * vminus_ln2_lo + vt${N}; 76 77 // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 78 $for N in range(ELEMENTS_TILE): 79 float vp${N} = vc5 * vt${N} + vc4; 80 81 $for N in range(ELEMENTS_TILE): 82 vp${N} = vp${N} * vt${N} + vc3; 83 84 $for N in range(ELEMENTS_TILE): 85 vp${N} = vp${N} * vt${N} + vc2; 86 87 $for N in range(ELEMENTS_TILE): 88 vp${N} = vp${N} * vt${N} + vc1; 89 90 // Reconstruct the final f value: 91 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) 92 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) 93 // = s + (t * s) * p 94 $for N in range(ELEMENTS_TILE): 95 vt${N} *= vs${N}; 96 97 $for N in range(ELEMENTS_TILE): 98 float vf${N} = vt${N} * vp${N} + vs${N}; 99 100 // For inputs below denormal cutoff, replace output with +0.0f. 101 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 102 $for N in range(ELEMENTS_TILE): 103 if XNN_UNPREDICTABLE(vx${N} < vdenorm_cutoff) { 104 vf${N} = 0.0f; 105 } 106 107 // Store ${ELEMENTS_TILE} outputs at a time. 108 $for N in range(ELEMENTS_TILE): 109 output[${N}] = vf${N}; 110 output += ${ELEMENTS_TILE}; 111 112 // Accumulate computed exponents. 113 $for N in range(ELEMENTS_TILE): 114 vacc${N % ACCUMULATORS} += vf${N}; 115 } 116 $if ACCUMULATORS > 1: 117 // Add up all accumulators to vacc0 118 $ACC_SLICE = 1 119 $while ACC_SLICE < ACCUMULATORS: 120 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 121 $if A + ACC_SLICE < ACCUMULATORS: 122 vacc${A} += vacc${A + ACC_SLICE}; 123 $ACC_SLICE *= 2 124 125 float vacc = vacc0; 126 $else: 127 float vacc = 0.0f; 128 for (; elements >= sizeof(float); elements -= sizeof(float)) { 129 // Load 1 input at a time. 130 const float vi = *input++; 131 132 // Subtract maximum input x := i - i_max. This implies x <= 0. 133 const float vx = vi - vi_max; 134 135 // Compute reduced argument n := round(x / log(2)). 136 // We do it by adding a large number (magic bias) to the product x * (1/log(2)), which cause rounding of the result 137 // to an integer, then subtracing the large number back. The trick with adding large number is valid only within 138 // certain bounds (|x| <= 2**22), but thats ok, because inputs outside of [-87.336540, 0.0] underflow expf(x) 139 // anyway. We fixup the result for such inputs at the very end of the algorithm. 140 float vn = vx * vlog2e + vmagic_bias; 141 142 // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e. 143 // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly. 144 const float vs = fp32_from_bits(fp32_to_bits(vn) << 23); 145 146 // Subtract the large number back to get final n := round(x / log(2)). 147 vn -= vmagic_bias; 148 149 // Compute reduced argument t := x - n * log(2). 150 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy. 151 float vt = vn * vminus_ln2_hi + vx; 152 vt = vn * vminus_ln2_lo + vt; 153 154 // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2]. 155 float vp = vc5 * vt + vc4; 156 vp = vp * vt + vc3; 157 vp = vp * vt + vc2; 158 vp = vp * vt + vc1; 159 160 // Reconstruct the final f value: 161 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))) 162 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))) 163 // = s + (t * s) * p 164 vt *= vs; 165 float vf = vt * vp + vs; 166 167 // For inputs below denormal cutoff, replace output with +0.0f. 168 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged. 169 if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) { 170 vf = 0.0f; 171 } 172 173 // Store 1 output at a time. 174 *output++ = vf; 175 176 // Accumulate computed exponents. 177 vacc += vf; 178 } 179 *sum = vacc; 180} 181