/external/XNNPACK/src/f32-ppmm/gen/
D | 8x8-minmax-neonfma.c | in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma():
     89  const float32x4_t vb4567 = vld1q_f32(w); w += 4;
    100  vacc0x4567 = vfmaq_laneq_f32(vacc0x4567, vb4567, va0123, 0);
    101  vacc1x4567 = vfmaq_laneq_f32(vacc1x4567, vb4567, va0123, 1);
    102  vacc2x4567 = vfmaq_laneq_f32(vacc2x4567, vb4567, va0123, 2);
    103  vacc3x4567 = vfmaq_laneq_f32(vacc3x4567, vb4567, va0123, 3);
    104  vacc4x4567 = vfmaq_laneq_f32(vacc4x4567, vb4567, va4567, 0);
    105  vacc5x4567 = vfmaq_laneq_f32(vacc5x4567, vb4567, va4567, 1);
    106  vacc6x4567 = vfmaq_laneq_f32(vacc6x4567, vb4567, va4567, 2);
    107  vacc7x4567 = vfmaq_laneq_f32(vacc7x4567, vb4567, va4567, 3);
    126  vacc0x4567 = vfmaq_f32(vacc0x4567, va0000, vb4567);
    [all …]
D | 4x8-minmax-neonfma.c | in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma():
     64  const float32x4_t vb4567 = vld1q_f32(w); w += 4;
     71  vacc0x4567 = vfmaq_laneq_f32(vacc0x4567, vb4567, va0123, 0);
     72  vacc1x4567 = vfmaq_laneq_f32(vacc1x4567, vb4567, va0123, 1);
     73  vacc2x4567 = vfmaq_laneq_f32(vacc2x4567, vb4567, va0123, 2);
     74  vacc3x4567 = vfmaq_laneq_f32(vacc3x4567, vb4567, va0123, 3);
     85  vacc0x4567 = vfmaq_f32(vacc0x4567, va0000, vb4567);
     86  vacc1x4567 = vfmaq_f32(vacc1x4567, va1111, vb4567);
     87  vacc2x4567 = vfmaq_f32(vacc2x4567, va2222, vb4567);
     88  vacc3x4567 = vfmaq_f32(vacc3x4567, va3333, vb4567);
D | 8x8-minmax-neon.c | in xnn_f32_ppmm_minmax_ukernel_8x8__neon():
     89  const float32x4_t vb4567 = vld1q_f32(w); w += 4;
     99  vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567, vget_low_f32(va0123), 0);
    100  vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567, vget_low_f32(va0123), 1);
    101  vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567, vget_high_f32(va0123), 0);
    102  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567, vget_high_f32(va0123), 1);
    103  vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567, vget_low_f32(va4567), 0);
    104  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567, vget_low_f32(va4567), 1);
    105  vacc6x4567 = vmlaq_lane_f32(vacc6x4567, vb4567, vget_high_f32(va4567), 0);
    106  vacc7x4567 = vmlaq_lane_f32(vacc7x4567, vb4567, vget_high_f32(va4567), 1);
D | 4x8-minmax-wasmsimd-arm-splat.c | in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat():
     67  const v128_t vb4567 = wasm_v128_load(w + 4);
     79  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0000, vb4567));
     80  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1111, vb4567));
     81  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2222, vb4567));
     82  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3333, vb4567));
D | 4x8-minmax-sse.c | in xnn_f32_ppmm_minmax_ukernel_4x8__sse():
     65  const __m128 vb4567 = _mm_load_ps(w + 4);
     77  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0000, vb4567));
     78  vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1111, vb4567));
     79  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2222, vb4567));
     80  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3333, vb4567));
D | 4x8-minmax-neon.c | in xnn_f32_ppmm_minmax_ukernel_4x8__neon():
     64  const float32x4_t vb4567 = vld1q_f32(w); w += 4;
     70  vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567, vget_low_f32(va0123), 0);
     71  vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567, vget_low_f32(va0123), 1);
     72  vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567, vget_high_f32(va0123), 0);
     73  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567, vget_high_f32(va0123), 1);
D | 4x8-minmax-wasmsimd-x86-splat.c | in xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat():
     65  const v128_t vb4567 = wasm_v128_load(w + 4);
     77  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0000, vb4567));
     78  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1111, vb4567));
     79  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2222, vb4567));
     80  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3333, vb4567));
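All of the PPMM kernels above (both operands pre-packed) share one inner-loop shape: one vector load pulls element k of four (or eight) consecutive A rows, a second load pulls the packed B columns, and each A lane is broadcast against the B vector with one multiply-accumulate per output row. The sketch below is my own reduction of that pattern to a 4x4 tile, not code from the repository; the function name and signature are illustrative.

    #include <arm_neon.h>
    #include <stddef.h>

    /* Illustrative 4x4 PPMM core (hypothetical, modeled on the kernels above).
     * A is packed so that vld1q_f32 yields element k of rows 0..3 at once;
     * vfmaq_laneq_f32 then broadcasts one lane against the B vector. */
    void ppmm_4x4(size_t kc, const float* a, const float* b,
                  float* c, size_t ldc) {
      float32x4_t vacc0 = vdupq_n_f32(0.0f);
      float32x4_t vacc1 = vdupq_n_f32(0.0f);
      float32x4_t vacc2 = vdupq_n_f32(0.0f);
      float32x4_t vacc3 = vdupq_n_f32(0.0f);
      for (size_t k = 0; k < kc; k++) {
        const float32x4_t va0123 = vld1q_f32(a); a += 4;  /* rows 0..3, col k */
        const float32x4_t vb0123 = vld1q_f32(b); b += 4;  /* cols 0..3, row k */
        vacc0 = vfmaq_laneq_f32(vacc0, vb0123, va0123, 0);
        vacc1 = vfmaq_laneq_f32(vacc1, vb0123, va0123, 1);
        vacc2 = vfmaq_laneq_f32(vacc2, vb0123, va0123, 2);
        vacc3 = vfmaq_laneq_f32(vacc3, vb0123, va0123, 3);
      }
      vst1q_f32(c + 0 * ldc, vacc0);
      vst1q_f32(c + 1 * ldc, vacc1);
      vst1q_f32(c + 2 * ldc, vacc2);
      vst1q_f32(c + 3 * ldc, vacc3);
    }

The plain -neon variants spell the same broadcast as vmlaq_lane_f32 over vget_low_f32/vget_high_f32 halves, because the lane-indexed FMA intrinsic requires AArch64; the wasmsimd/sse -splat variants pre-splat each A lane (va0000..va3333) and use separate multiply and add.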
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 6x8inc-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_arm_loadsplat():
    106  const v128_t vb4567 = wasm_v128_load(w + 4);
    115  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    116  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    117  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    118  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    119  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
    120  vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567));
D | 5x8inc-minmax-sse-load1.c | in xnn_f32_gemminc_minmax_ukernel_5x8__sse_load1():
     94  const __m128 vb4567 = _mm_load_ps(w + 4);
    102  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
    103  vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
    104  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
    105  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
    106  vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567));
D | 5x8inc-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemminc_minmax_ukernel_5x8__wasmsimd_arm_loadsplat():
     96  const v128_t vb4567 = wasm_v128_load(w + 4);
    104  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    105  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    106  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    107  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    108  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
D | 6x8inc-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_gemminc_minmax_ukernel_6x8__wasmsimd_x86_loadsplat():
    104  const v128_t vb4567 = wasm_v128_load(w + 4);
    113  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    114  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    115  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    116  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    117  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
    118  vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567));
D | 4x8inc-minmax-sse-load1.c | in xnn_f32_gemminc_minmax_ukernel_4x8__sse_load1():
     84  const __m128 vb4567 = _mm_load_ps(w + 4);
     91  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
     92  vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
     93  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
     94  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
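Two naming conventions in this directory matter for reading these hits. The "inc" kernels (xnn_f32_gemminc_*) start their accumulators from a caller-supplied partial-sum buffer instead of the packed bias, so a partially computed tile can be resumed. "loadsplat" names the A-side strategy: each A element is loaded and broadcast to all four lanes, then multiplied against an 8-wide row of packed weights held as two v128 vectors. Below is a rough sketch of one such step, with my own names and a plain scalar-load-plus-splat standing in for the generated load-and-splat (equivalent here):

    #include <wasm_simd128.h>

    /* One loadsplat step for a single A row (illustrative, not generated
     * source): broadcast a0[0], then multiply-add against weight columns
     * 0-3 and 4-7. */
    static inline void gemm_row_step(const float* a0, const float* w,
                                     v128_t* vacc0x0123, v128_t* vacc0x4567) {
      const v128_t va0 = wasm_f32x4_splat(*a0);     /* splat one A element */
      const v128_t vb0123 = wasm_v128_load(w);      /* weights, cols 0-3   */
      const v128_t vb4567 = wasm_v128_load(w + 4);  /* weights, cols 4-7   */
      *vacc0x0123 = wasm_f32x4_add(*vacc0x0123, wasm_f32x4_mul(va0, vb0123));
      *vacc0x4567 = wasm_f32x4_add(*vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    }

The 5x8 and 6x8 kernels simply repeat this step for va1..va4 or va5, one splatted element per output row.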
/external/XNNPACK/src/f32-gemm/gen/ |
D | 6x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat():
    104  const v128_t vb4567 = wasm_v128_load(w + 4);
    113  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    114  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    115  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    116  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    117  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
    118  vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567));
D | 5x8-minmax-sse-load1.c | in xnn_f32_gemm_minmax_ukernel_5x8__sse_load1():
     92  const __m128 vb4567 = _mm_load_ps(w + 4);
    100  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
    101  vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
    102  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
    103  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
    104  vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567));
D | 5x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat():
     94  const v128_t vb4567 = wasm_v128_load(w + 4);
    102  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    103  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    104  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    105  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    106  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
D | 6x8-minmax-wasmsimd-x86-loadsplat.c | in xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat():
    102  const v128_t vb4567 = wasm_v128_load(w + 4);
    111  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    112  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    113  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    114  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    115  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
    116  vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567));
D | 4x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat():
     84  const v128_t vb4567 = wasm_v128_load(w + 4);
     91  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
     92  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
     93  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
     94  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
D | 4x8-minmax-sse-load1.c | in xnn_f32_gemm_minmax_ukernel_4x8__sse_load1():
     82  const __m128 vb4567 = _mm_load_ps(w + 4);
     89  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
     90  vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
     91  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
     92  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
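The sse-load1 kernels use the same one-element broadcast, but with pre-FMA SSE: _mm_load1_ps splats the A element, and since SSE1 has no fused multiply-add, each step is an explicit _mm_mul_ps feeding _mm_add_ps, with the 8 output columns carried in two __m128 accumulators. A sketch under the same caveats (names are mine; the packed weights are assumed 16-byte aligned, which _mm_load_ps requires):

    #include <xmmintrin.h>

    /* One load1 step for a single A row (illustrative): splat a0[0] and
     * multiply-add against aligned packed weights for columns 0-3 and 4-7. */
    static inline void sse_load1_step(const float* a0, const float* w,
                                      __m128* vacc0x0123, __m128* vacc0x4567) {
      const __m128 va0 = _mm_load1_ps(a0);      /* broadcast one A element */
      const __m128 vb0123 = _mm_load_ps(w);     /* aligned packed weights  */
      const __m128 vb4567 = _mm_load_ps(w + 4);
      *vacc0x0123 = _mm_add_ps(*vacc0x0123, _mm_mul_ps(va0, vb0123));
      *vacc0x4567 = _mm_add_ps(*vacc0x4567, _mm_mul_ps(va0, vb4567));
    }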
/external/XNNPACK/src/f32-igemm/gen/ |
D | 6x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat():
    119  const v128_t vb4567 = wasm_v128_load(w + 4);
    136  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    138  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    140  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    142  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    144  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
    146  vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567));
D | 5x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat():
    108  const v128_t vb4567 = wasm_v128_load(w + 4);
    123  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    125  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    127  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    129  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
    131  vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567));
D | 5x8-minmax-sse-load1.c | in xnn_f32_igemm_minmax_ukernel_5x8__sse_load1():
    106  const __m128 vb4567 = _mm_load_ps(w + 4);
    121  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
    123  vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
    125  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
    127  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
    129  vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567));
D | 4x8-minmax-wasmsimd-arm-loadsplat.c | in xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat():
     97  const v128_t vb4567 = wasm_v128_load(w + 4);
    110  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567));
    112  vacc1x4567 = wasm_f32x4_add(vacc1x4567, wasm_f32x4_mul(va1, vb4567));
    114  vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2, vb4567));
    116  vacc3x4567 = wasm_f32x4_add(vacc3x4567, wasm_f32x4_mul(va3, vb4567));
D | 4x8-minmax-sse-load1.c | in xnn_f32_igemm_minmax_ukernel_4x8__sse_load1():
     95  const __m128 vb4567 = _mm_load_ps(w + 4);
    108  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
    110  vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
    112  vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
    114  vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
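The IGEMM hits show the same vb4567 multiply-accumulate core as the GEMM kernels; what differs is how A arrives. Indirect GEMM reads each A row through a buffer of pointers that is advanced every k-slice, which lets a convolution alias one input row into many output positions without materializing an im2col matrix. A deliberately scalar sketch of that control flow (names and the padding convention are mine; the real kernels redirect padding taps to a shared zero buffer rather than checking for NULL):

    #include <stddef.h>

    /* Minimal indirect-GEMM skeleton for one output element (illustrative). */
    static float igemm_1x1(size_t ks, size_t kc, const float** a,
                           const float* w, const float* zero) {
      float acc = 0.0f;
      do {
        const float* a0 = *a++;          /* next A-row pointer              */
        if (a0 == NULL) a0 = zero;       /* padding tap reads a zero buffer */
        for (size_t k = 0; k < kc; k++) {
          acc += a0[k] * (*w++);         /* same MAC core as the GEMM hits  */
        }
      } while (--ks != 0);
      return acc;
    }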
/external/XNNPACK/src/f32-vbinary/gen/ |
D | vmin-wasmsimd-x86-x8.c | in xnn_f32_vmin_ukernel__wasmsimd_x86_x8():
     38  const v128_t vb4567 = wasm_v128_load(b + 4);
     42  const v128_t vm4567 = wasm_f32x4_lt(va4567, vb4567);
     45  v128_t vy4567 = wasm_v128_bitselect(va4567, vb4567, vm4567);
D | vmax-wasmsimd-x86-x8.c | in xnn_f32_vmax_ukernel__wasmsimd_x86_x8():
     38  const v128_t vb4567 = wasm_v128_load(b + 4);
     42  const v128_t vm4567 = wasm_f32x4_le(va4567, vb4567);
     45  v128_t vy4567 = wasm_v128_bitselect(vb4567, va4567, vm4567);
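These two x86-tuned kernels avoid wasm_f32x4_min/_max. The WebAssembly min/max semantics (NaN propagation, -0.0 ordering) take several SSE instructions to emulate, whereas a lane compare plus bitselect lowers to roughly one compare and one blend. The trade-off is IEEE-style NaN behavior: the compare is false for NaN inputs, so min falls through to b and max to a instead of propagating the NaN. Extracted as standalone helpers (my names, not XNNPACK's):

    #include <wasm_simd128.h>

    /* Compare+bitselect min/max, as in the x86-tuned kernels above. */
    v128_t f32x4_min_x86(v128_t va, v128_t vb) {
      const v128_t vm = wasm_f32x4_lt(va, vb);  /* lanes where a < b      */
      return wasm_v128_bitselect(va, vb, vm);   /* a where a < b, else b  */
    }

    v128_t f32x4_max_x86(v128_t va, v128_t vb) {
      const v128_t vm = wasm_f32x4_le(va, vb);  /* lanes where a <= b     */
      return wasm_v128_bitselect(vb, va, vm);   /* b where a <= b, else a */
    }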