/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x8c2-minmax-neon-mull-padal-dup.c | all in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup():
     84  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
     91  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    107  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    123  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    139  const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    205  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    208  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    212  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    216  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    220  const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
|
D | 3x8c2-minmax-neon-mull-padal-dup.c | all in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup():
     75  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
     82  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
     98  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    114  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    171  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    174  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    178  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    182  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
|
D | 2x8c2-minmax-neon-mull-padal-dup.c | all in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup():
     66  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
     73  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
     89  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    137  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    140  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    144  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x8c2-minmax-neon-mull-padal-dup.c | all in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup():
    101  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    108  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    124  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    140  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    156  const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    222  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    225  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    229  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    233  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    237  const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
|
D | 3x8c2-minmax-neon-mull-padal-dup.c | all in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup():
     90  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
     97  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    113  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    129  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    186  const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    189  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    193  const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
    197  const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_…
|
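Note: every qs8 reference above is the k-step-2 slice of the same "mull-padal-dup" multiply, and each vprod line is truncated at the same vreinterpret_… tail. Below is a minimal C sketch of one such step; the completed dup chain and the vpadalq_s16 accumulation are assumptions based on the usual form of this kernel family rather than text recovered from the truncated lines, and the helper name is hypothetical.

#include <arm_neon.h>

// Sketch of one c2 step of the qs8 mull-padal-dup pattern (one row, columns 0-3).
static inline int32x4_t qs8_c2_step(int32x4_t vacc0x0123,
                                    int8x8_t va0,        // 8 int8 activations of row 0
                                    int8x8_t vb0123c2) { // 8 int8 weights, 2 per output channel
  // "dup": replicate one 16-bit lane (= the int8 activation pair a[4], a[5])
  // across the whole vector.
  const int8x8_t va0c2 =
      vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2));
  // "mull": widening int8 x int8 -> int16 multiply, 8 products at once.
  const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, va0c2);
  // "padal": pairwise add adjacent int16 products and accumulate into int32,
  // folding the 2 per-channel products into the 4 channel accumulators.
  return vpadalq_s16(vacc0x0123, vprod0x0123c2);
}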
/external/XNNPACK/src/f32-gemm/gen/ |
D | 8x8s4-minmax-neonfma.c | all in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma():
    172  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    175  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    176  vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
    177  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
    178  vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
    179  vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2);
    180  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    181  vacc6x0123 = vfmaq_f32(vacc6x0123, va6, vb0123c2);
    182  vacc7x0123 = vfmaq_f32(vacc7x0123, va7, vb0123c2);
|
D | 8x8s4-minmax-neon.c | all in xnn_f32_gemm_minmax_ukernel_8x8s4__neon():
    172  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    175  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    176  vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c2);
    177  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
    178  vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c2);
    179  vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c2);
    180  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    181  vacc6x0123 = vmlaq_f32(vacc6x0123, va6, vb0123c2);
    182  vacc7x0123 = vmlaq_f32(vacc7x0123, va7, vb0123c2);
|
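Note: in the s4 ("shift by 4") kernels above and below, vb0123c2 is loaded at a fixed offset (w + 16) rather than through a moving pointer, and the multiply is a plain lane-wise FMA over a whole activation register. A minimal sketch of the c2 step follows; it assumes the usual s4 arrangement, in which B is packed to match rotated activations and the rotation between steps is done with vextq_f32 (the rotation itself is not visible in the referenced lines), and the helper name is hypothetical.

#include <arm_neon.h>

// Sketch of one s4 c2 step for a single row and column block. The plain-neon
// variants of the same kernels use vmlaq_f32 (separate multiply + add) in
// place of vfmaq_f32.
static inline float32x4_t f32_s4_c2_step(float32x4_t vacc0x0123,
                                         float32x4_t* va0,   // 4 consecutive k-values, rotated in place
                                         const float* w) {
  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  // B columns 0-3, k-step 2
  vacc0x0123 = vfmaq_f32(vacc0x0123, *va0, vb0123c2);
  *va0 = vextq_f32(*va0, *va0, 1);  // assumed one-lane shift for the next (c3) step
  return vacc0x0123;
}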
D | 6x8-minmax-neonfma-lane-ld128.c | all in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128():
    128  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    131  vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    132  vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    133  vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    134  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    135  vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c2, vget_high_f32(va4), 0);
    136  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
|
D | 6x8s4-minmax-neon.c | all in xnn_f32_gemm_minmax_ukernel_6x8s4__neon():
    142  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    145  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    146  vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c2);
    147  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
    148  vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c2);
    149  vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c2);
    150  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
|
D | 6x8s4-minmax-neonfma.c | all in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma():
    142  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    145  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    146  vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
    147  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
    148  vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
    149  vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2);
    150  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
|
D | 6x8-minmax-neon-lane-ld128.c | all in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128():
    128  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    131  vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    132  vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    133  vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    134  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    135  vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c2, vget_high_f32(va4), 0);
    136  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
|
D | 4x8-minmax-neonfma-lane-ld128.c | all in xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128():
    102  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    105  vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    106  vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    107  vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    108  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
|
D | 4x8-minmax-neon-lane-ld128.c | all in xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128():
    102  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    105  vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    106  vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    107  vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    108  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
|
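Note: the lane-ld128 kernels above load 4 k-values per row with a single 128-bit load, then consume them one scalar lane at a time; c2 is lane 2 of the activation register, addressed as lane 0 of its high half. The step is fully visible in the references, so the sketch below only restates it as a self-contained C helper (the helper name is hypothetical).

#include <arm_neon.h>

// Sketch of the lane-ld128 c2 step for one row and column block.
static inline float32x4_t f32_ld128_c2_step(float32x4_t vacc0x0123,
                                            float32x4_t va0,        // 4 k-values of row 0
                                            float32x4_t vb0123c2) { // B columns 0-3, k-step 2
  // FMA form (neonfma kernels); the plain-neon kernels use vmlaq_lane_f32,
  // a separate multiply + add that may round differently.
  return vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
}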
D | 4x8-wasmsimd-splat.c | all in xnn_f32_gemm_ukernel_4x8__wasmsimd_splat():
    118  const v128_t vb0123c2 = wasm_v128_load(w + 16);  (local)
    121  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0c2, vb0123c2));
    122  vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c2, vb0123c2));
    123  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    124  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2));
|
D | 5x8-relu-wasmsimd-splat.c | all in xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat():
    135  const v128_t vb0123c2 = wasm_v128_load(w + 16);  (local)
    138  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0c2, vb0123c2));
    139  vacc1x0123 = wasm_f32x4_add(vacc1x0123, wasm_f32x4_mul(va1c2, vb0123c2));
    140  vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2));
    141  vacc3x0123 = wasm_f32x4_add(vacc3x0123, wasm_f32x4_mul(va3c2, vb0123c2));
    142  vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4c2, vb0123c2));
|
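Note: the wasmsimd splat kernels mirror the NEON dup idea, but WebAssembly SIMD has no fused multiply-add, so each step is an explicit mul + add. A minimal C sketch of the c2 step follows; producing va0c2 with wasm_v32x4_shuffle is an assumption (the splat itself sits outside the referenced lines), and the helper name is hypothetical.

#include <wasm_simd128.h>

// Sketch of the wasmsimd splat c2 step for one row and column block.
static inline v128_t f32_wasmsimd_splat_c2_step(v128_t vacc0x0123,
                                                v128_t va0,          // 4 k-values of row 0
                                                const float* w) {
  const v128_t vb0123c2 = wasm_v128_load(w + 16);  // B columns 0-3, k-step 2
  // "splat": broadcast lane 2 of the activation vector into all 4 lanes.
  const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2);
  return wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0c2, vb0123c2));
}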
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 8x8s4inc-minmax-neon.c | all in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon():
    174  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    177  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    178  vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c2);
    179  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
    180  vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c2);
    181  vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c2);
    182  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    183  vacc6x0123 = vmlaq_f32(vacc6x0123, va6, vb0123c2);
    184  vacc7x0123 = vmlaq_f32(vacc7x0123, va7, vb0123c2);
|
D | 8x8s4inc-minmax-neonfma.c | all in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma():
    174  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    177  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    178  vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
    179  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
    180  vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
    181  vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2);
    182  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    183  vacc6x0123 = vfmaq_f32(vacc6x0123, va6, vb0123c2);
    184  vacc7x0123 = vfmaq_f32(vacc7x0123, va7, vb0123c2);
|
D | 6x8inc-minmax-neon-lane-ld128.c | all in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128():
    130  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    133  vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    134  vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    135  vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    136  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    137  vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c2, vget_high_f32(va4), 0);
    138  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
|
D | 6x8s4inc-minmax-neon.c | all in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon():
    144  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    147  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    148  vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c2);
    149  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
    150  vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c2);
    151  vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c2);
    152  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
|
D | 6x8s4inc-minmax-neonfma.c | all in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma():
    144  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    147  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    148  vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
    149  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
    150  vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
    151  vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2);
    152  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
|
D | 6x8inc-minmax-neonfma-lane-ld128.c | all in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128():
    130  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    133  vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    134  vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    135  vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    136  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    137  vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c2, vget_high_f32(va4), 0);
    138  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 8x8s4-minmax-neonfma.c | all in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma():
    205  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    208  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    209  vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
    210  vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
    211  vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
    212  vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2);
    213  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    214  vacc6x0123 = vfmaq_f32(vacc6x0123, va6, vb0123c2);
    215  vacc7x0123 = vfmaq_f32(vacc7x0123, va7, vb0123c2);
|
D | 8x8s4-minmax-neon.c | all in xnn_f32_igemm_minmax_ukernel_8x8s4__neon():
    205  const float32x4_t vb0123c2 = vld1q_f32(w + 16);  (local)
    208  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    209  vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c2);
    210  vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
    211  vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c2);
    212  vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c2);
    213  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    214  vacc6x0123 = vmlaq_f32(vacc6x0123, va6, vb0123c2);
    215  vacc7x0123 = vmlaq_f32(vacc7x0123, va7, vb0123c2);
|
D | 6x8-minmax-neonfma-lane-ld128.c | all in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128():
    156  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    159  vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    160  vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    161  vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    162  vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    163  vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c2, vget_high_f32(va4), 0);
    164  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
|
D | 6x8-minmax-neon-lane-ld128.c | all in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128():
    156  const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;  (local)
    159  vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
    160  vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
    161  vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
    162  vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
    163  vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c2, vget_high_f32(va4), 0);
    164  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
|