// Auto-generated file. Do not edit!
//   Template: src/f32-velu/scalar-rr2-p6.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>

#include <xnnpack/common.h>
#include <xnnpack/vunary.h>

#include <fp16/bitcasts.h>


// Scalar ELU micro-kernel, processing 4 elements per main-loop iteration.
//
// Computes, per element:
//   y = x * beta                            when x >= 0
//   y = alpha * (exp(x * prescale) - 1)     when x <  0
//
// "rr2-p6" = two-constant extended-precision range reduction plus a
// degree-6 polynomial approximation of exp() on the reduced interval.
//
// n      - length of x/y in BYTES; must be a multiple of sizeof(float).
// x      - input array.
// y      - output array. NOTE(review): aliasing contract with x is not
//          visible in this file - confirm against callers if relevant.
// params - prescale/alpha/beta in scalar layout.
void xnn_f32_velu_ukernel__scalar_rr2_p6_x4(
    size_t n,
    const float* x,
    float* y,
    const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(n % sizeof(float) == 0);

  const float vprescale = params->scalar.prescale;
  const float valpha = params->scalar.alpha;
  const float vbeta = params->scalar.beta;

  // Magic bias: adding it to z*log2(e) places round(z*log2(e)) in the low
  // mantissa bits; shifting those bits into the exponent field yields 2**n.
  const float vmagic_bias = 0x1.8000FEp23f;
  const float vlog2e = 0x1.715476p+0f;
  // For z at or below this cutoff, exp(z) underflows in float: the saturation
  // branches below force s = t = 0 so the result collapses to -alpha.
  const float vsat_cutoff = -0x1.154246p+4f;
  // -ln(2) split into hi/lo parts: t = z - n*ln(2) is evaluated as
  // t = n*(-ln2_hi) + z, then t += n*(-ln2_lo), for extra precision.
  const float vminus_ln2_hi = -0x1.62E440p-1f;
  const float vminus_ln2_lo = 0x1.0105C6p-21f;
  // Degree-6 polynomial coefficients for exp(t) on the reduced interval
  // (fit chosen by the generator; degree-1/0 terms are handled explicitly).
  const float vc6 = 0x1.6b7338p-10f;
  const float vc5 = 0x1.12278Ep-7f;
  const float vc4 = 0x1.555716p-5f;
  const float vc3 = 0x1.5554B0p-3f;
  const float vc2 = 0x1.FFFFFEp-2f;
  const float vone = 1.0f;

  // Main loop: 4 elements at a time, same pipeline replicated per lane.
  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
    float vx0 = x[0];
    float vx1 = x[1];
    float vx2 = x[2];
    float vx3 = x[3];
    x += 4;

    // z = prescale * x is the argument passed to exp().
    const float vz0 = vx0 * vprescale;
    const float vz1 = vx1 * vprescale;
    const float vz2 = vx2 * vprescale;
    const float vz3 = vx3 * vprescale;

    // n = round(z * log2(e)), captured in the low bits via the magic bias.
    float vn0 = vz0 * vlog2e + vmagic_bias;
    float vn1 = vz1 * vlog2e + vmagic_bias;
    float vn2 = vz2 * vlog2e + vmagic_bias;
    float vn3 = vz3 * vlog2e + vmagic_bias;

    // s = 2**n, built by shifting the integer bits into the exponent field;
    // then subtract the bias to recover n as a float.
    float vs0 = fp32_from_bits(fp32_to_bits(vn0) << 23);
    vn0 -= vmagic_bias;
    float vs1 = fp32_from_bits(fp32_to_bits(vn1) << 23);
    vn1 -= vmagic_bias;
    float vs2 = fp32_from_bits(fp32_to_bits(vn2) << 23);
    vn2 -= vmagic_bias;
    float vs3 = fp32_from_bits(fp32_to_bits(vn3) << 23);
    vn3 -= vmagic_bias;

    // t = z - n*ln(2), using the hi part of -ln(2) first...
    float vt0 = vn0 * vminus_ln2_hi + vz0;
    float vt1 = vn1 * vminus_ln2_hi + vz1;
    float vt2 = vn2 * vminus_ln2_hi + vz2;
    float vt3 = vn3 * vminus_ln2_hi + vz3;

    // ...then folding in the lo part to recover the rounding error.
    vt0 = vn0 * vminus_ln2_lo + vt0;
    vt1 = vn1 * vminus_ln2_lo + vt1;
    vt2 = vn2 * vminus_ln2_lo + vt2;
    vt3 = vn3 * vminus_ln2_lo + vt3;

    // Saturation: for very negative z, zero s and t so the final
    // reconstruction below produces exactly alpha * (0 - 1).
    if XNN_UNPREDICTABLE(vz0 <= vsat_cutoff) {
      vs0 = 0.0f;
      vt0 = 0.0f;
    }
    if XNN_UNPREDICTABLE(vz1 <= vsat_cutoff) {
      vs1 = 0.0f;
      vt1 = 0.0f;
    }
    if XNN_UNPREDICTABLE(vz2 <= vsat_cutoff) {
      vs2 = 0.0f;
      vt2 = 0.0f;
    }
    if XNN_UNPREDICTABLE(vz3 <= vsat_cutoff) {
      vs3 = 0.0f;
      vt3 = 0.0f;
    }

    // Horner evaluation: p = t*(c2 + t*(c3 + t*(c4 + t*(c5 + t*c6)))).
    float vp0 = vc6 * vt0 + vc5;
    float vp1 = vc6 * vt1 + vc5;
    float vp2 = vc6 * vt2 + vc5;
    float vp3 = vc6 * vt3 + vc5;

    vp0 = vp0 * vt0 + vc4;
    vp1 = vp1 * vt1 + vc4;
    vp2 = vp2 * vt2 + vc4;
    vp3 = vp3 * vt3 + vc4;

    vp0 = vp0 * vt0 + vc3;
    vp1 = vp1 * vt1 + vc3;
    vp2 = vp2 * vt2 + vc3;
    vp3 = vp3 * vt3 + vc3;

    vp0 = vp0 * vt0 + vc2;
    vp1 = vp1 * vt1 + vc2;
    vp2 = vp2 * vt2 + vc2;
    vp3 = vp3 * vt3 + vc2;

    vp0 *= vt0;
    vp1 *= vt1;
    vp2 *= vt2;
    vp3 *= vt3;

    // Reconstruct expm1(z) = s*t + s*t*p + (s - 1):
    // first scale t by s and form (s - 1)...
    vt0 *= vs0;
    vs0 -= vone;
    vt1 *= vs1;
    vs1 -= vone;
    vt2 *= vs2;
    vs2 -= vone;
    vt3 *= vs3;
    vs3 -= vone;

    // ...then p = p*(s*t) + (s*t).
    vp0 = vp0 * vt0 + vt0;
    vp1 = vp1 * vt1 + vt1;
    vp2 = vp2 * vt2 + vt2;
    vp3 = vp3 * vt3 + vt3;

    // Negative branch: e = alpha * expm1(z); positive branch: y = beta * x.
    const float ve0 = (vp0 + vs0) * valpha;
    float vy0 = vx0 * vbeta;
    const float ve1 = (vp1 + vs1) * valpha;
    float vy1 = vx1 * vbeta;
    const float ve2 = (vp2 + vs2) * valpha;
    float vy2 = vx2 * vbeta;
    const float ve3 = (vp3 + vs3) * valpha;
    float vy3 = vx3 * vbeta;

    // Select the negative branch where x < 0.
    if XNN_UNPREDICTABLE(vx0 < 0.0f) {
      vy0 = ve0;
    }
    if XNN_UNPREDICTABLE(vx1 < 0.0f) {
      vy1 = ve1;
    }
    if XNN_UNPREDICTABLE(vx2 < 0.0f) {
      vy2 = ve2;
    }
    if XNN_UNPREDICTABLE(vx3 < 0.0f) {
      vy3 = ve3;
    }

    y[0] = vy0;
    y[1] = vy1;
    y[2] = vy2;
    y[3] = vy3;
    y += 4;
  }
  // Remainder loop: 1-3 leftover elements, same pipeline as above.
  if XNN_UNLIKELY(n != 0) {
    do {
      float vx = *x++;

      const float vz = vx * vprescale;

      float vn = vz * vlog2e + vmagic_bias;
      float vs = fp32_from_bits(fp32_to_bits(vn) << 23);
      vn -= vmagic_bias;

      float vt = vn * vminus_ln2_hi + vz;
      vt = vn * vminus_ln2_lo + vt;

      if XNN_UNPREDICTABLE(vz <= vsat_cutoff) {
        vs = 0.0f;
        vt = 0.0f;
      }

      float vp = vc6 * vt + vc5;
      vp = vp * vt + vc4;
      vp = vp * vt + vc3;
      vp = vp * vt + vc2;
      vp *= vt;

      vt *= vs;
      vs -= vone;
      vp = vp * vt + vt;
      const float ve = (vp + vs) * valpha;

      float vy = vx * vbeta;
      if XNN_UNPREDICTABLE(vx < 0.0f) {
        vy = ve;
      }

      *y++ = vy;

      n -= sizeof(float);
    } while (n != 0);
  }
}