1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert DATATYPE in ["QS8", "QU8"] 7$assert CHANNEL_TILE % 8 == 0 8$assert CHANNEL_TILE >= 8 9$assert ROW_TILE >= 3 10$assert ROW_SUBTILE >= 3 11$assert ROW_SUBTILE <= ROW_TILE 12$assert REQUANTIZATION == "FP32" 13$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 14#include <assert.h> 15 16#include <smmintrin.h> 17 18#include <xnnpack/gavgpool.h> 19#include <xnnpack/math.h> 20#include <xnnpack/unaligned.h> 21 22 23$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" 24$_MM_CVTEPX8_EPI16 = {"QS8": "_mm_cvtepi8_epi16", "QU8": "_mm_cvtepu8_epi16"}[DATATYPE] 25$_MM_CVTEPX16_EPI32 = {"QS8": "_mm_cvtepi16_epi32", "QU8": "_mm_cvtepu16_epi32"}[DATATYPE] 26$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] 27$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] 28void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse41_c${CHANNEL_TILE}( 29 size_t rows, 30 size_t channels, 31 const ${XINT8_T}* input, 32 size_t input_stride, 33 const ${XINT8_T}* zero, 34 int32_t* buffer, 35 ${XINT8_T}* output, 36 const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 37{ 38 assert(rows > ${ROW_TILE}); 39 assert(channels != 0); 40 41 const ${XINT8_T}* i0 = input; 42 $for M in range(1, ROW_TILE): 43 const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); 44 $if CHANNEL_TILE <= 16: 45 const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T}); 46 $else: 47 const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T}); 48 49 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); 50 int32_t* b = buffer; 51 size_t c = channels; 52 for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { 53 $for M in range(2): 54 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 55 $for C in range(8, CHANNEL_TILE, 8): 56 const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); 57 i${M} += ${CHANNEL_TILE}; 58 59 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 60 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 61 $for C in range(8, CHANNEL_TILE, 8): 62 __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 63 const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); 64 i2 += ${CHANNEL_TILE}; 65 66 $for M in range(3, ROW_TILE): 67 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 68 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 69 $for C in range(8, CHANNEL_TILE, 8): 70 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); 71 const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); 72 i${M} += ${CHANNEL_TILE}; 73 74 $for C in range(0, CHANNEL_TILE, 8): 75 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]}); 76 77 $if DATATYPE == "QU8": 78 const __m128i vzero = _mm_setzero_si128(); 79 $for C in range(0, CHANNEL_TILE, 8): 80 __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]}); 81 $if DATATYPE == "QS8": 82 __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16); 83 $else: 84 __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); 85 86 $for C in range(0, CHANNEL_TILE, 4): 87 vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias); 88 89 _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); 90 $for C in range(4, CHANNEL_TILE, 4): 91 _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]}); 92 b += ${CHANNEL_TILE}; 93 } 94 $if CHANNEL_TILE > 16: 95 if XNN_UNLIKELY(c != 0) { 96 do { 97 $for M in range(2): 98 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 99 i${M} += 8; 100 101 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 102 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 103 i2 += 8; 104 105 $for M in range(3, ROW_TILE): 106 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 107 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 108 i${M} += 8; 109 110 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_TILE-1}x${ABC[0:8]}); 111 112 __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]}); 113 $if DATATYPE == "QS8": 114 __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16); 115 $else: 116 __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128()); 117 118 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias); 119 vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias); 120 121 _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); 122 _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]}); 123 b += 8; 124 125 c = doz(c, 8); 126 } while (c != 0); 127 } 128 129 for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { 130 $for M in range(ROW_SUBTILE): 131 i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 132 133 int32_t* b = buffer; 134 size_t c = channels; 135 for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { 136 $for M in range(2): 137 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 138 $for C in range(8, CHANNEL_TILE, 8): 139 const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); 140 i${M} += ${CHANNEL_TILE}; 141 142 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 143 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 144 $for C in range(8, CHANNEL_TILE, 8): 145 __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 146 const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); 147 i2 += ${CHANNEL_TILE}; 148 149 $for M in range(3, ROW_SUBTILE): 150 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 151 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 152 $for C in range(8, CHANNEL_TILE, 8): 153 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); 154 const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); 155 i${M} += ${CHANNEL_TILE}; 156 157 $for C in range(0, CHANNEL_TILE, 8): 158 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); 159 160 $if DATATYPE == "QU8": 161 const __m128i vzero = _mm_setzero_si128(); 162 $for C in range(0, CHANNEL_TILE, 8): 163 __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]}); 164 $if DATATYPE == "QS8": 165 __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16); 166 $else: 167 __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); 168 169 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b)); 170 $for C in range(4, CHANNEL_TILE, 4): 171 vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (b + ${C}))); 172 173 _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); 174 $for C in range(4, CHANNEL_TILE, 4): 175 _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]}); 176 b += ${CHANNEL_TILE}; 177 } 178 $if CHANNEL_TILE > 16: 179 if XNN_UNLIKELY(c != 0) { 180 do { 181 $for M in range(2): 182 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 183 i${M} += 8; 184 185 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 186 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 187 i2 += 8; 188 189 $for M in range(3, ROW_SUBTILE): 190 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 191 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 192 i${M} += 8; 193 194 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); 195 196 __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]}); 197 $if DATATYPE == "QS8": 198 __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16); 199 $else: 200 __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128()); 201 202 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b)); 203 vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (b + 4))); 204 205 _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); 206 _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]}); 207 b += 8; 208 209 c = doz(c, 8); 210 } while (c != 0); 211 } 212 } 213 214 i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); 215 $for M in range(1, ROW_SUBTILE): 216 i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 217 $if M % 2 == 1: 218 if XNN_UNPREDICTABLE(rows < ${M+1}) { 219 i${M} = zero; 220 } 221 $else: 222 if XNN_UNPREDICTABLE(rows <= ${M}) { 223 i${M} = zero; 224 } 225 226 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); 227 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); 228 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); 229 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); 230 for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { 231 $for M in range(2): 232 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 233 $for C in range(8, CHANNEL_TILE, 8): 234 const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); 235 i${M} += ${CHANNEL_TILE}; 236 237 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 238 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 239 $for C in range(8, CHANNEL_TILE, 8): 240 __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 241 const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); 242 i2 += ${CHANNEL_TILE}; 243 244 $for M in range(3, ROW_SUBTILE): 245 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 246 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 247 $for C in range(8, CHANNEL_TILE, 8): 248 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); 249 const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); 250 i${M} += ${CHANNEL_TILE}; 251 252 $for C in range(0, CHANNEL_TILE, 8): 253 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); 254 255 $if DATATYPE == "QU8": 256 const __m128i vzero = _mm_setzero_si128(); 257 $for C in range(0, CHANNEL_TILE, 8): 258 __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]}); 259 $if DATATYPE == "QS8": 260 __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16); 261 $else: 262 __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); 263 264 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer)); 265 $for C in range(4, CHANNEL_TILE, 4): 266 vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (buffer + ${C}))); 267 buffer += ${CHANNEL_TILE}; 268 269 $for C in range(0, CHANNEL_TILE, 4): 270 __m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]}); 271 272 $for C in range(0, CHANNEL_TILE, 4): 273 vfpacc${ABC[C:C+4]} = _mm_mul_ps(vfpacc${ABC[C:C+4]}, vscale); 274 275 $for C in range(0, CHANNEL_TILE, 4): 276 vfpacc${ABC[C:C+4]} = _mm_min_ps(vfpacc${ABC[C:C+4]}, voutput_max_less_zero_point); 277 278 $for C in range(0, CHANNEL_TILE, 4): 279 vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vfpacc${ABC[C:C+4]}); 280 281 $for C in range(0, CHANNEL_TILE, 8): 282 __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point); 283 284 $for C in range(0, CHANNEL_TILE, 16): 285 $if C + 8 < CHANNEL_TILE: 286 __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); 287 $else: 288 __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); 289 290 $for C in range(0, CHANNEL_TILE, 16): 291 $if C + 8 < CHANNEL_TILE: 292 vout${ABC[C:C+16]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+16]}, voutput_min); 293 $else: 294 vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min); 295 296 $if CHANNEL_TILE > 8: 297 _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); 298 $else: 299 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); 300 $for C in range(16, CHANNEL_TILE, 16): 301 $if C + 8 < CHANNEL_TILE: 302 _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]}); 303 $else: 304 _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]}); 305 output += ${CHANNEL_TILE}; 306 } 307 if XNN_UNLIKELY(channels != 0) { 308 ${"do " if CHANNEL_TILE > 8 else ""}{ 309 $for M in range(2): 310 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 311 i${M} += 8; 312 313 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 314 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 315 i2 += 8; 316 317 $for M in range(3, ROW_SUBTILE): 318 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 319 const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 320 i${M} += 8; 321 322 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); 323 324 __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]}); 325 $if DATATYPE == "QS8": 326 __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16); 327 $else: 328 __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128()); 329 330 vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer)); 331 vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (buffer + 4))); 332 buffer += 8; 333 334 __m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]}); 335 __m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]}); 336 337 vfpacc${ABC[0:4]} = _mm_mul_ps(vfpacc${ABC[0:4]}, vscale); 338 vfpacc${ABC[4:8]} = _mm_mul_ps(vfpacc${ABC[4:8]}, vscale); 339 340 vfpacc${ABC[0:4]} = _mm_min_ps(vfpacc${ABC[0:4]}, voutput_max_less_zero_point); 341 vfpacc${ABC[4:8]} = _mm_min_ps(vfpacc${ABC[4:8]}, voutput_max_less_zero_point); 342 343 vacc${ABC[0:4]} = _mm_cvtps_epi32(vfpacc${ABC[0:4]}); 344 vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]}); 345 346 __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point); 347 348 __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]}); 349 vout${ABC[0:8]}${ABC[0:8]} = ${_MM_MAX_EPX8}(vout${ABC[0:8]}${ABC[0:8]}, voutput_min); 350 351 $if CHANNEL_TILE > 8: 352 if XNN_LIKELY(channels >= 8) { 353 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); 354 output += 8; 355 channels -= 8; 356 } else { 357 if (channels & 4) { 358 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); 359 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); 360 output += 4; 361 } 362 if (channels & 2) { 363 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); 364 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); 365 output += 2; 366 } 367 if (channels & 1) { 368 *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); 369 output += 1; 370 } 371 channels = 0; 372 } 373 $else: 374 if (channels & 4) { 375 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); 376 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); 377 output += 4; 378 } 379 if (channels & 2) { 380 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); 381 vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); 382 output += 2; 383 } 384 if (channels & 1) { 385 *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); 386 } 387 }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} 388 } 389} 390