// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert DATATYPE in ["QS8", "QU8"]
$assert BATCH_TILE % 4 == 0
$SIMD_TILE = BATCH_TILE // 4
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <arm_acle.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vcvt.h>


$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
$XINT8X4_T = {"QS8": "int8x4_t", "QU8": "uint8x4_t"}[DATATYPE]
$XINT16X2_T = {"QS8": "int16x2_t", "QU8": "uint16x2_t"}[DATATYPE]
$__XXTAB16 = {"QS8": "__sxtab16", "QU8": "__uxtab16"}[DATATYPE]
$__XSAT = {"QS8": "__ssat", "QU8": "__usat"}[DATATYPE]
void xnn_${DATATYPE.lower()}_vcvt_ukernel__armsimd32_x${BATCH_TILE}(
    size_t n,
    const ${XINT8_T}* x,
    ${XINT8_T}* y,
    const union xnn_${DATATYPE.lower()}_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  const ${XINT16X2_T} vminus_input_zero_point = (${XINT16X2_T}) params->armsimd32.minus_input_zero_point;
  const int32_t vbias = params->armsimd32.bias;
  const int32_t vmultiplier = params->armsimd32.multiplier;
  $if BATCH_TILE > 4:
    for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
      $for N in range(SIMD_TILE):
        const ${XINT8X4_T} vx${ABC[4*N:4*N+4]} = (${XINT8X4_T}) unaligned_indexed_load_u32(x, ${N});
      x += ${BATCH_TILE};

      // ${__XXTAB16} widens the even (bytes 0/2) and odd (bytes 1/3) lanes to
      // 16 bits and adds the negated input zero point to each.
      $for N in range(0, BATCH_TILE, 4):
        const ${XINT16X2_T} vx${ABC[N]}${ABC[N+2]} = ${__XXTAB16}(vminus_input_zero_point, vx${ABC[N:N+4]});
        const ${XINT16X2_T} vx${ABC[N+1]}${ABC[N+3]} = ${__XXTAB16}(vminus_input_zero_point, __ror(vx${ABC[N:N+4]}, 8));

      // __smlawb/__smlawt compute vbias + ((vmultiplier * lane) >> 16) for the
      // bottom/top 16-bit lane, respectively.
      $for N in range(0, BATCH_TILE, 4):
        int32_t vacc${ABC[N]} = __smlawb(vmultiplier, vx${ABC[N]}${ABC[N+2]}, vbias);
        int32_t vacc${ABC[N+1]} = __smlawb(vmultiplier, vx${ABC[N+1]}${ABC[N+3]}, vbias);
        int32_t vacc${ABC[N+2]} = __smlawt(vmultiplier, vx${ABC[N]}${ABC[N+2]}, vbias);
        int32_t vacc${ABC[N+3]} = __smlawt(vmultiplier, vx${ABC[N+1]}${ABC[N+3]}, vbias);

      $for N in range(BATCH_TILE):
        vacc${ABC[N]} = ${__XSAT}(math_asr_s32(vacc${ABC[N]}, 1), 8);

      $for N in range(BATCH_TILE):
        y[${N}] = (${XINT8_T}) vacc${ABC[N]};
      y += ${BATCH_TILE};
    }
  for (; n >= 4 * sizeof(${XINT8_T}); n -= 4 * sizeof(${XINT8_T})) {
    const ${XINT8X4_T} vx0123 = (${XINT8X4_T}) unaligned_load_u32(x);
    x += 4;

    const ${XINT16X2_T} vx02 = ${__XXTAB16}(vminus_input_zero_point, vx0123);
    const ${XINT16X2_T} vx13 = ${__XXTAB16}(vminus_input_zero_point, __ror(vx0123, 8));

    int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
    int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
    int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);
    int32_t vacc3 = __smlawt(vmultiplier, vx13, vbias);

    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8);
    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8);
    vacc2 = ${__XSAT}(math_asr_s32(vacc2, 1), 8);
    vacc3 = ${__XSAT}(math_asr_s32(vacc3, 1), 8);

    y[0] = (${XINT8_T}) vacc0;
    y[1] = (${XINT8_T}) vacc1;
    y[2] = (${XINT8_T}) vacc2;
    y[3] = (${XINT8_T}) vacc3;
    y += 4;
  }
  if XNN_UNLIKELY(n != 0) {
    const ${XINT8X4_T} vx0123 = (${XINT8X4_T}) unaligned_load_u32(x);

    const ${XINT16X2_T} vx02 = ${__XXTAB16}(vminus_input_zero_point, vx0123);
    const ${XINT16X2_T} vx13 = ${__XXTAB16}(vminus_input_zero_point, __ror(vx0123, 8));

    int32_t vacc0 = __smlawb(vmultiplier, vx02, vbias);
    int32_t vacc1 = __smlawb(vmultiplier, vx13, vbias);
    const int32_t vacc2 = __smlawt(vmultiplier, vx02, vbias);

    vacc0 = ${__XSAT}(math_asr_s32(vacc0, 1), 8);
    vacc1 = ${__XSAT}(math_asr_s32(vacc1, 1), 8);

    if (n & (2 * sizeof(${XINT8_T}))) {
      y[0] = (${XINT8_T}) vacc0;
      y[1] = (${XINT8_T}) vacc1;
      // Recycle the third result into vacc0 so the 1-element tail below can
      // store it without recomputation.
      vacc0 = ${__XSAT}(math_asr_s32(vacc2, 1), 8);
      y += 2;
    }
    if (n & (1 * sizeof(${XINT8_T}))) {
      y[0] = (${XINT8_T}) vacc0;
    }
  }
}
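
// A minimal scalar sketch (illustrative only, QS8 flavor) of the per-element
// math the kernel above performs through the SIMD32 intrinsics. The helper
// name qs8_cvt_scalar_ref is hypothetical and not part of XNNPACK; it assumes
// the same minus_input_zero_point, multiplier, and bias values that the
// armsimd32 params carry. For QU8 the byte would be zero-extended instead and
// __usat would clamp to [0, 255].
static inline int8_t qs8_cvt_scalar_ref(
  int8_t x, int16_t minus_input_zero_point, int32_t multiplier, int32_t bias)
{
  // ${__XXTAB16} (__sxtab16): sign-extend the byte to 16 bits and add the
  // negated input zero point.
  const int32_t vx = (int32_t) x + (int32_t) minus_input_zero_point;
  // __smlawb: bias + ((multiplier * vx) >> 16), product taken in wide arithmetic.
  int32_t vacc = bias + (int32_t) (((int64_t) multiplier * (int64_t) vx) >> 16);
  // math_asr_s32(vacc, 1): arithmetic shift right by one (drops the rounding bit).
  vacc = vacc >> 1;
  // ${__XSAT} (__ssat(vacc, 8)): saturate to the signed 8-bit range [-128, 127].
  if (vacc < -128) vacc = -128;
  if (vacc > 127) vacc = 127;
  return (int8_t) vacc;
}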