// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert DATATYPE in ["QS8", "QU8"]
$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 3
$assert ROW_SUBTILE >= 3
$assert ROW_SUBTILE <= ROW_TILE
$assert REQUANTIZATION == "FP32"
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <emmintrin.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>


$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE]
void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse2_c${CHANNEL_TILE}(
    size_t rows,
    size_t channels,
    const ${XINT8_T}* input,
    size_t input_stride,
    const ${XINT8_T}* zero,
    int32_t* buffer,
    ${XINT8_T}* output,
    const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > ${ROW_TILE});
  assert(channels != 0);

  const ${XINT8_T}* i0 = input;
  $for M in range(1, ROW_TILE):
    const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
  $if CHANNEL_TILE <= 16:
    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
  $else:
    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});

  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
  $if DATATYPE == "QU8":
    const __m128i vzero = _mm_setzero_si128();
  // First pass: accumulate the first ${ROW_TILE} rows into 32-bit sums, seeded with init_bias, and store them to the scratch buffer.
  int32_t* b = buffer;
  size_t c = channels;
  for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
    $for M in range(ROW_TILE + 2):

      $for C in range(0, CHANNEL_TILE, 8):
        $if M == 3:
          __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
        $elif M > 3:
          vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
        $if 1 <= M <= ROW_TILE:
          $if DATATYPE == "QS8":
            const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
          $else:
            const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero);
        $if M < ROW_TILE:
          $if C == 0:
            const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
          $else:
            const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C}));
      $if M < ROW_TILE:
        i${M} += ${CHANNEL_TILE};

    $for C in range(0, CHANNEL_TILE, 8):
      $if DATATYPE == "QS8":
        const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
        __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
        __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
      $else:
        __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero);
        __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);

    $for C in range(0, CHANNEL_TILE, 4):
      vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias);

    _mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
    $for C in range(4, CHANNEL_TILE, 4):
      _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]});
    b += ${CHANNEL_TILE};
  }
  $if CHANNEL_TILE > 16:
    if XNN_UNLIKELY(c != 0) {
      do {
        $for M in range(ROW_TILE + 3):

          $if M == 4:
            __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi${M-4}x${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
          $elif M > 4:
            vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
          $if 2 <= M <= ROW_TILE + 1:
            $if DATATYPE == "QS8":
              const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
            $else:
              const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero);
          $if M < ROW_TILE:
            const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
            i${M} += 8;

        $if DATATYPE == "QS8":
          const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
          __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
          __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
        $else:
          __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero);
          __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero);

        vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias);
        vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias);

        _mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
        _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]});
        b += 8;

        c = doz(c, 8);
      } while (c != 0);
    }

  // Intermediate passes: accumulate ${ROW_SUBTILE} more rows into the 32-bit buffer on each pass.
  for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
    $for M in range(ROW_SUBTILE):
      i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
      $for M in range(ROW_SUBTILE + 2):

        $for C in range(0, CHANNEL_TILE, 8):
          $if M == 3:
            __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
          $elif M > 3:
            vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
          $if 1 <= M <= ROW_SUBTILE:
            $if DATATYPE == "QS8":
              const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
            $else:
              const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero);
          $if M < ROW_SUBTILE:
            $if C == 0:
              const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
            $else:
              const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C}));
        $if M < ROW_SUBTILE:
          i${M} += ${CHANNEL_TILE};

      $for C in range(0, CHANNEL_TILE, 8):
        $if DATATYPE == "QS8":
          const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
          __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
          __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
        $else:
          __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero);
          __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);

      vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b));
      $for C in range(4, CHANNEL_TILE, 4):
        vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (b + ${C})));

      _mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
      $for C in range(4, CHANNEL_TILE, 4):
        _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]});
      b += ${CHANNEL_TILE};
    }
    $if CHANNEL_TILE > 16:
      if XNN_UNLIKELY(c != 0) {
        do {
          $for M in range(ROW_SUBTILE + 3):

            $if M == 4:
              __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi${M-4}x${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
            $elif M > 4:
              vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
            $if 2 <= M <= ROW_SUBTILE + 1:
              $if DATATYPE == "QS8":
                const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
              $else:
                const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero);
            $if M < ROW_SUBTILE:
              const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
              i${M} += 8;

          $if DATATYPE == "QS8":
            const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
            __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
            __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
          $else:
            __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero);
            __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero);

          vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b));
          vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (b + 4)));

          _mm_store_si128((__m128i*) b, vacc${ABC[0:4]});
          _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]});
          b += 8;

          c = doz(c, 8);
        } while (c != 0);
      }
  }

  // Last pass: accumulate the remaining rows (row pointers past the end are redirected to the zero buffer), requantize with FP32 scaling, and store the quantized outputs.
  i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
  $for M in range(1, ROW_SUBTILE):
    i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
    $if M % 2 == 1:
      if XNN_UNPREDICTABLE(rows < ${M+1}) {
        i${M} = zero;
      }
    $else:
      if XNN_UNPREDICTABLE(rows <= ${M}) {
        i${M} = zero;
      }

  const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
  for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
    $for M in range(ROW_SUBTILE + 2):

      $for C in range(0, CHANNEL_TILE, 8):
        $if M == 3:
          __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
        $elif M > 3:
          vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]});
        $if 1 <= M <= ROW_SUBTILE:
          $if DATATYPE == "QS8":
            const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8);
          $else:
            const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero);
        $if M < ROW_SUBTILE:
          $if C == 0:
            const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
          $else:
            const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C}));
      $if M < ROW_SUBTILE:
        i${M} += ${CHANNEL_TILE};

    $for C in range(0, CHANNEL_TILE, 8):
      $if DATATYPE == "QS8":
        const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]});
        __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
        __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]});
      $else:
        __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero);
        __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero);

    vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer));
    $for C in range(4, CHANNEL_TILE, 4):
      vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (buffer + ${C})));
    buffer += ${CHANNEL_TILE};

    $for C in range(0, CHANNEL_TILE, 4):
      __m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]});

    $for C in range(0, CHANNEL_TILE, 4):
      vfpacc${ABC[C:C+4]} = _mm_mul_ps(vfpacc${ABC[C:C+4]}, vscale);

    $for C in range(0, CHANNEL_TILE, 4):
      vfpacc${ABC[C:C+4]} = _mm_min_ps(vfpacc${ABC[C:C+4]}, voutput_max_less_zero_point);

    $for C in range(0, CHANNEL_TILE, 4):
      vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vfpacc${ABC[C:C+4]});

    $for C in range(0, CHANNEL_TILE, 8):
      __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point);

    $if DATATYPE == "QS8":
      $for C in range(0, CHANNEL_TILE, 8):
        vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min);

    $for C in range(0, CHANNEL_TILE, 16):
      $if C + 8 < CHANNEL_TILE:
        __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
      $else:
        __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});

    $if DATATYPE == "QU8":
      $for C in range(0, CHANNEL_TILE, 16):
        $if C + 8 < CHANNEL_TILE:
          vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min);
        $else:
          vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);

    $if CHANNEL_TILE > 8:
      _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
    $else:
      _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]});
    $for C in range(16, CHANNEL_TILE, 16):
      $if C + 8 < CHANNEL_TILE:
        _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]});
      $else:
        _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]});
    output += ${CHANNEL_TILE};
  }
  if XNN_UNLIKELY(channels != 0) {
    ${"do " if CHANNEL_TILE > 8 else ""}{
      $for M in range(ROW_SUBTILE + 3):

        $if M == 4:
          __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi${M-4}x${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
        $elif M > 4:
          vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]});
        $if 2 <= M <= ROW_SUBTILE + 1:
          $if DATATYPE == "QS8":
            const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8);
          $else:
            const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero);
        $if M < ROW_SUBTILE:
          const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
          i${M} += 8;

      $if DATATYPE == "QS8":
        const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]});
        __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
        __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]});
      $else:
        __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero);
        __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero);

      vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer));
      vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (buffer + 4)));
      buffer += 8;

      __m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]});
      __m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]});

      vfpacc${ABC[0:4]} = _mm_mul_ps(vfpacc${ABC[0:4]}, vscale);
      vfpacc${ABC[4:8]} = _mm_mul_ps(vfpacc${ABC[4:8]}, vscale);

      vfpacc${ABC[0:4]} = _mm_min_ps(vfpacc${ABC[0:4]}, voutput_max_less_zero_point);
      vfpacc${ABC[4:8]} = _mm_min_ps(vfpacc${ABC[4:8]}, voutput_max_less_zero_point);

      vacc${ABC[0:4]} = _mm_cvtps_epi32(vfpacc${ABC[0:4]});
      vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]});

      __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
      $if DATATYPE == "QS8":
        vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min);

      __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]});
      $if DATATYPE == "QU8":
        vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epu8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);

      $if CHANNEL_TILE > 8:
        if XNN_LIKELY(channels >= 8) {
          _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]});
          output += 8;
          channels -= 8;
        } else {
          if (channels & 4) {
            unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}));
            vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
            output += 4;
          }
          uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
          if (channels & 2) {
            unaligned_store_u16(output, (uint16_t) vout${ABC[0:4]});
            vout${ABC[0:4]} >>= 16;
            output += 2;
          }
          if (channels & 1) {
            *output = (${XINT8_T}) vout${ABC[0:4]};
            output += 1;
          }
          channels = 0;
        }
      $else:
        if (channels & 4) {
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}));
          vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
          output += 4;
        }
        uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
        if (channels & 2) {
          unaligned_store_u16(output, (uint16_t) vout${ABC[0:4]});
          vout${ABC[0:4]} >>= 16;
          output += 2;
        }
        if (channels & 1) {
          *output = (${XINT8_T}) vout${ABC[0:4]};
        }
    }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
  }
}