/external/XNNPACK/src/f32-ppmm/gen/
D | 3x3-minmax-scalar.c | in xnn_f32_ppmm_minmax_ukernel_3x3__scalar():
     52  float vacc2x2 = vacc0x2;  (local)
     75  vacc2x2 += va2 * vb2;
     89  vacc2x2 = math_min_f32(vacc2x2, vmax);
    100  vacc2x2 = math_max_f32(vacc2x2, vmin);
    105  c2[2] = vacc2x2;
    129  vacc2x0 = vacc2x2;

D | 4x4-minmax-scalar.c | in xnn_f32_ppmm_minmax_ukernel_4x4__scalar():
     58  float vacc2x2 = vacc0x2;  (local)
     90  vacc2x2 += va2 * vb2;
    111  vacc2x2 = math_min_f32(vacc2x2, vmax);
    129  vacc2x2 = math_max_f32(vacc2x2, vmin);
    143  c2[2] = vacc2x2;
    174  vacc2x0 = vacc2x2;

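Every scalar PPMM row above shows the same accumulator lifecycle: all output rows start from the row-0 copies of the packed weights (line 52), one multiply-add is applied per k step (line 75), the result is clamped to [vmin, vmax] (lines 89 and 100) and stored (line 105), and the nc-remainder path shifts surviving columns down (line 129). A minimal sketch of that flow, assuming a 3x3 tile; the function name and parameter layout are illustrative, not the real xnn_f32_ppmm signature:

    #include <stddef.h>

    static inline float math_min_f32(float a, float b) { return a < b ? a : b; }
    static inline float math_max_f32(float a, float b) { return a > b ? a : b; }

    /* Hypothetical 3x3 tile mirroring the listed lines. */
    void ppmm_3x3_sketch(size_t kc, const float* a, const float* w,
                         float* c0, float* c1, float* c2,
                         float vmin, float vmax) {
      /* All three rows start from the same packed initializers (line 52). */
      float vacc0x0 = w[0], vacc0x1 = w[1], vacc0x2 = w[2];
      float vacc1x0 = vacc0x0, vacc1x1 = vacc0x1, vacc1x2 = vacc0x2;
      float vacc2x0 = vacc0x0, vacc2x1 = vacc0x1, vacc2x2 = vacc0x2;
      w += 3;
      for (size_t k = 0; k < kc; k++) {
        const float va0 = a[0], va1 = a[1], va2 = a[2];  a += 3;
        const float vb0 = w[0], vb1 = w[1], vb2 = w[2];  w += 3;
        vacc0x0 += va0 * vb0; vacc0x1 += va0 * vb1; vacc0x2 += va0 * vb2;
        vacc1x0 += va1 * vb0; vacc1x1 += va1 * vb1; vacc1x2 += va1 * vb2;
        vacc2x0 += va2 * vb0; vacc2x1 += va2 * vb1; vacc2x2 += va2 * vb2;  /* line 75 */
      }
      /* Clamp (lines 89 and 100), then store (line 105). */
      c0[0] = math_max_f32(math_min_f32(vacc0x0, vmax), vmin);
      c0[1] = math_max_f32(math_min_f32(vacc0x1, vmax), vmin);
      c0[2] = math_max_f32(math_min_f32(vacc0x2, vmax), vmin);
      c1[0] = math_max_f32(math_min_f32(vacc1x0, vmax), vmin);
      c1[1] = math_max_f32(math_min_f32(vacc1x1, vmax), vmin);
      c1[2] = math_max_f32(math_min_f32(vacc1x2, vmax), vmin);
      c2[0] = math_max_f32(math_min_f32(vacc2x0, vmax), vmin);
      c2[1] = math_max_f32(math_min_f32(vacc2x1, vmax), vmin);
      c2[2] = math_max_f32(math_min_f32(vacc2x2, vmax), vmin);
    }

PPMM differs from plain GEMM in that the A panel is pre-packed too, which is why the mr values of va are contiguous per k step; the sketch omits the remainder path that line 129 belongs to.
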
/external/XNNPACK/src/bf16-gemm/gen/ |
D | 3x4c8-minmax-neonbf16-bfmlal.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal():
     66  float32x4_t vacc2x2 = vacc0x2;  (local)
     88  vacc2x2 = vbfmlalbq_f32(vacc2x2, va2, vb2);
    101  vacc2x2 = vbfmlaltq_f32(vacc2x2, va2, vb2);
    146  vacc2x2 = vbfmlalbq_f32(vacc2x2, va2x2, vb2);
    147  vacc2x2 = vbfmlaltq_f32(vacc2x2, va2x2, vb2);
    165  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    179  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

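The neonbf16-bfmlal kernels keep bf16 data packed eight to a vector and widen during the multiply: BFMLALB accumulates products of the even (bottom) bf16 lanes into an f32 vector and BFMLALT the odd (top) lanes, which is why each K step updates vacc2x2 twice (lines 88 and 101). A sketch of one step; it assumes a compiler targeting Armv8.2-A or later with the bf16 extension (e.g. -march=armv8.6-a+bf16), and the helper name is hypothetical:

    #include <arm_neon.h>

    /* One K step of the bfmlal accumulation (lines 88/101 above). */
    float32x4_t bfmlal_step(float32x4_t vacc2x2, bfloat16x8_t va2, bfloat16x8_t vb2) {
      vacc2x2 = vbfmlalbq_f32(vacc2x2, va2, vb2);  /* even bf16 lanes -> f32 FMA */
      vacc2x2 = vbfmlaltq_f32(vacc2x2, va2, vb2);  /* odd bf16 lanes -> f32 FMA */
      return vacc2x2;
    }

Each vacc2x2 holds four partial sums for one row/column pair, so the kernels still reduce horizontally at the end; that is the pairwise-add pattern visible at lines 165 and 179.
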
D | 3x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland():
     67  float32x4_t vacc2x2 = vacc0x2;  (local)
     98  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    120  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    179  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    210  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    222  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    236  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

D | 3x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip():
     67  float32x4_t vacc2x2 = vacc0x2;  (local)
     98  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    120  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    179  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    210  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    222  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    236  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

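The neonfma-shland and neonfma-zip kernels emulate the same math without bf16 instructions, exploiting the fact that a bf16 value is bit-identical to an f32 whose low 16 mantissa bits are zero. The shland files split each register into even and odd bf16 lanes with a 16-bit left shift and a 0xFFFF0000 mask (the e/o suffixes on va2e/va2o feeding vfmaq_f32 above); the zip files get the same split by interleaving the lanes with zeros. A sketch of the shift+and step, reconstructed from the occurrences above; the helper name is hypothetical:

    #include <arm_neon.h>

    /* One K step of the shift+and ("shland") bf16 emulation (lines 98/120 above). */
    float32x4_t shland_step(float32x4_t vacc2x2, uint16x8_t va2, uint16x8_t vb2) {
      const uint32x4_t vmask = vdupq_n_u32(0xFFFF0000u);
      /* Even bf16 lanes: shift the low 16 bits of each 32-bit pair into the high half. */
      const float32x4_t va2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(va2), 16));
      const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));
      /* Odd bf16 lanes: already in the high half, just mask off the low half. */
      const float32x4_t va2o = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_u16(va2), vmask));
      const float32x4_t vb2o = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_u16(vb2), vmask));
      vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);  /* line 98 */
      vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);  /* line 120 */
      return vacc2x2;
    }
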
D | 4x4c8-minmax-neonbf16-bfmlal.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal():
     72  float32x4_t vacc2x2 = vacc0x2;  (local)
    101  vacc2x2 = vbfmlalbq_f32(vacc2x2, va2, vb2);
    118  vacc2x2 = vbfmlaltq_f32(vacc2x2, va2, vb2);
    172  vacc2x2 = vbfmlalbq_f32(vacc2x2, va2x2, vb2);
    173  vacc2x2 = vbfmlaltq_f32(vacc2x2, va2x2, vb2);
    198  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    216  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

D | 3x4c8-minmax-neonbf16-bfdot.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot():
     66  float32x4_t vacc2x2 = vacc0x2;  (local)
     88  vacc2x2 = vbfdotq_f32(vacc2x2, va2, vb2);
    125  vacc2x2 = vbfdotq_f32(vacc2x2, va2x2, vb2);
    140  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    154  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

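The neonbf16-bfdot kernels collapse the even/odd pair into one instruction: BFDOT accumulates a two-element bf16 dot product into each f32 lane, so a single update per K step suffices (line 88). All bf16 kernels in this directory then share the same epilogue, folding a row's four per-column accumulators with pairwise adds; the vadd_f32 of low/high halves at line 154 begins an alternative two-step reduction of the same accumulator. A sketch under the same bf16-extension assumption; helper names are hypothetical:

    #include <arm_neon.h>

    /* One K step with BFDOT (line 88 above). */
    float32x4_t bfdot_step(float32x4_t vacc2x2, bfloat16x8_t va2, bfloat16x8_t vb2) {
      return vbfdotq_f32(vacc2x2, va2, vb2);
    }

    /* Folds row 2's four per-column accumulators into one result vector;
     * the inner vpaddq_f32 of vacc2x2/vacc2x3 is line 140 above. */
    float32x4_t reduce_row2(float32x4_t vacc2x0, float32x4_t vacc2x1,
                            float32x4_t vacc2x2, float32x4_t vacc2x3) {
      const float32x4_t vacc2x01 = vpaddq_f32(vacc2x0, vacc2x1);
      const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);  /* line 140 */
      return vpaddq_f32(vacc2x01, vacc2x23);
    }
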
D | 4x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip():
     73  float32x4_t vacc2x2 = vacc0x2;  (local)
    112  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    139  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    211  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    250  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    265  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    283  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

D | 4x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland():
     73  float32x4_t vacc2x2 = vacc0x2;  (local)
    112  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    139  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    211  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    250  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    265  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    283  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

D | 5x4c8-minmax-neonbf16-bfmlal.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal():
     78  float32x4_t vacc2x2 = vacc0x2;  (local)
    114  vacc2x2 = vbfmlalbq_f32(vacc2x2, va2, vb2);
    135  vacc2x2 = vbfmlaltq_f32(vacc2x2, va2, vb2);
    198  vacc2x2 = vbfmlalbq_f32(vacc2x2, va2x2, vb2);
    199  vacc2x2 = vbfmlaltq_f32(vacc2x2, va2x2, vb2);
    231  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    253  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

D | 4x4c8-minmax-neonbf16-bfdot.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot():
     72  float32x4_t vacc2x2 = vacc0x2;  (local)
    101  vacc2x2 = vbfdotq_f32(vacc2x2, va2, vb2);
    145  vacc2x2 = vbfdotq_f32(vacc2x2, va2x2, vb2);
    165  const float32x4_t vacc2x23 = vpaddq_f32(vacc2x2, vacc2x3);
    183  const float32x2_t vsum2x2 = vadd_f32(vget_low_f32(vacc2x2), vget_high_f32(vacc2x2));

/external/XNNPACK/src/qc8-gemm/gen/ |
D | 3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c | in xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128():
     66  v128_t vacc2x2 = vacc0x2;  (local)
     97  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    108  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

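In the quantized (qc8/qs8/qu8) WasmSIMD kernels the int8 inputs are widened to i16 first (the vxa2/vxb2 operands above); wasm_i32x4_dot_i16x8 then multiplies eight i16 pairs and adds adjacent products, so each i32 lane accumulates a two-element dot product per K step (lines 93/97). The truncated line 108 opens the epilogue, which sums the per-column accumulators while transposing them with interleaving shuffles. A sketch of both pieces, assuming a simd128-enabled wasm toolchain (e.g. clang --target=wasm32 -msimd128); helper names are hypothetical:

    #include <wasm_simd128.h>

    /* One K step of the dot16x2 accumulation (lines 93/97 above). */
    v128_t dot16x2_step(v128_t vacc2x2, v128_t vxa2, v128_t vxb2) {
      return wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    }

    /* First epilogue stage (the truncated line 108): interleave the column-0
     * and column-2 accumulators of row 2 and add, leaving partial sums laid
     * out as c0, c2, c0, c2. */
    v128_t fold_cols02(v128_t vacc2x0, v128_t vacc2x2) {
      return wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5),
                            wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7));
    }

The ld64/ld128 suffixes describe only how wide the per-step loads of packed weights are; the accumulation itself is identical.
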
D | 3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c | in xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64():
     66  v128_t vacc2x2 = vacc0x2;  (local)
     93  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    108  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

D | 3x4c8-minmax-fp32-sse2-ld64.c | in xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64():
     67  __m128i vacc2x2 = vacc0x2;  (local)
    100  vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
    116  … vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x…

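The SSE2 kernels do the same job with 16-bit multiply-add: _mm_madd_epi16 multiplies eight i16 pairs and sums adjacent products into four i32 lanes (line 100), and the epilogue folds the column-0 and column-2 accumulators with unpack-and-add, which is what the truncated line 116 begins. A sketch; helper names are hypothetical:

    #include <emmintrin.h>

    /* One K step of the SSE2 accumulation (line 100 above). */
    __m128i madd_step(__m128i vacc2x2, __m128i vxa2, __m128i vxb2) {
      return _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
    }

    /* Epilogue stage from the truncated line 116: interleave-and-add the
     * column-0 and column-2 accumulators (result lanes alternate c0, c2). */
    __m128i fold_cols02_sse2(__m128i vacc2x0, __m128i vacc2x2) {
      return _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2),
                           _mm_unpackhi_epi32(vacc2x0, vacc2x2));
    }
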
D | 3x4c8-minmax-fp32-sse2-ld128.c | in xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128():
     67  __m128i vacc2x2 = vacc0x2;  (local)
    103  vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
    114  … vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x…

/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c | in xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64():
     66  v128_t vacc2x2 = vacc0x2;  (local)
     93  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    108  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

D | 3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c | in xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128():
     66  v128_t vacc2x2 = vacc0x2;  (local)
     97  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    108  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

D | 3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c | in xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2():
     66  v128_t vacc2x2 = vacc0x2;  (local)
     93  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    108  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

D | 3x4c8-minmax-fp32-sse2-ld128.c | in xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128():
     67  __m128i vacc2x2 = vacc0x2;  (local)
    103  vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
    114  … vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x…

/external/XNNPACK/src/qu8-gemm/gen/ |
D | 3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c | in xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64():
     67  v128_t vacc2x2 = vacc0x2;  (local)
     94  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    109  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

/external/XNNPACK/src/qc8-igemm/gen/ |
D | 3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c | in xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128():
     65  v128_t vacc2x2 = vacc0x2;  (local)
    112  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    125  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

D | 3x4c8-minmax-fp32-sse2-ld64.c | in xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64():
     66  __m128i vacc2x2 = vacc0x2;  (local)
    115  vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
    133  … vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x…

/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x4c8-minmax-fp32-sse2-ld128.c | in xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128():
     66  __m128i vacc2x2 = vacc0x2;  (local)
    118  vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
    131  … vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x…

D | 3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c | in xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128():
     65  v128_t vacc2x2 = vacc0x2;  (local)
    112  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_dot_i16x8(vxa2, vxb2));
    125  …02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, …

/external/XNNPACK/src/qu8-igemm/gen/ |
D | 3x4c8-minmax-fp32-sse2-ld64.c | in xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64():
     66  __m128i vacc2x2 = vacc0x2;  (local)
    117  vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
    135  … vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x…