
Searched refs:vacc (Results 1 – 25 of 251) sorted by relevance

/external/XNNPACK/src/f32-hswish/
scalar.c.in
39 float vacc${ABC[N]} = vx${ABC[N]} * vsixth + vhalf;
42 vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, 0.0f);
45 vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, vone);
48 vacc${ABC[N]} *= vx${ABC[N]};
51 y[${N}] = vacc${ABC[N]};
58 float vacc = vx * vsixth + vhalf;
59 vacc = ${MAX_F32}(vacc, 0.0f);
60 vacc = ${MIN_F32}(vacc, vone);
61 vacc = vacc * vx;
62 *y++ = vacc;
[all …]
avx.c.in
42 __m256 vacc${ABC[N:N+8]} = _mm256_fmadd_ps(vx${ABC[N:N+8]}, vsixth, vhalf);
45 __m256 vacc${ABC[N:N+8]} = _mm256_mul_ps(vx${ABC[N:N+8]}, vsixth);
48 vacc${ABC[N:N+8]} = _mm256_add_ps(vacc${ABC[N:N+8]}, vhalf);
51 vacc${ABC[N:N+8]} = _mm256_max_ps(vacc${ABC[N:N+8]}, vzero);
54 vacc${ABC[N:N+8]} = _mm256_min_ps(vacc${ABC[N:N+8]}, vone);
57 vacc${ABC[N:N+8]} = _mm256_mul_ps(vacc${ABC[N:N+8]}, vx${ABC[N:N+8]});
59 _mm256_storeu_ps(y, vacc${ABC[0:8]});
61 _mm256_storeu_ps(y + ${N}, vacc${ABC[N:N+8]});
69 __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
71 __m256 vacc = _mm256_mul_ps(vx, vsixth);
[all …]
avx512f.c.in
39 __m512 vacc${ABC[N:N+16]} = _mm512_fmadd_ps(vx${ABC[N:N+16]}, vsixth, vhalf);
42 vacc${ABC[N:N+16]} = _mm512_max_ps(vacc${ABC[N:N+16]}, vzero);
45 vacc${ABC[N:N+16]} = _mm512_min_ps(vacc${ABC[N:N+16]}, vone);
48 vacc${ABC[N:N+16]} = _mm512_mul_ps(vacc${ABC[N:N+16]}, vx${ABC[N:N+16]});
50 _mm512_storeu_ps(y, vacc${ABC[0:16]});
52 _mm512_storeu_ps(y + ${N}, vacc${ABC[N:N+16]});
59 __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf); variable
60 vacc = _mm512_max_ps(vacc, vzero);
61 vacc = _mm512_min_ps(vacc, vone);
62 vacc = _mm512_mul_ps(vacc, vx);
[all …]
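
Taken together, the three f32-hswish templates above evaluate the same hard-swish formula, y = x * clamp(x/6 + 1/2, 0, 1); they differ only in vector width and instruction set. A minimal scalar C sketch of that computation (the function name and loop below are illustrative, not taken from the templates):

#include <stddef.h>

/* Hard-swish as computed by the templates above: acc = x*(1/6) + 1/2,
 * clamped to [0, 1], then multiplied back by x. */
static void hswish_f32_scalar(size_t n, const float* x, float* y) {
  const float vsixth = 1.0f / 6.0f;
  const float vhalf = 0.5f;
  const float vone = 1.0f;
  for (size_t i = 0; i < n; i++) {
    const float vx = x[i];
    float vacc = vx * vsixth + vhalf;
    vacc = vacc > 0.0f ? vacc : 0.0f;  /* ${MAX_F32}(vacc, 0.0f) */
    vacc = vacc < vone ? vacc : vone;  /* ${MIN_F32}(vacc, vone) */
    y[i] = vacc * vx;
  }
}
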
/external/XNNPACK/src/f32-hswish/gen/
avx-x8.c
54 __m256 vacc = _mm256_mul_ps(vx, vsixth); in xnn_f32_hswish_ukernel__avx_x8() local
55 vacc = _mm256_add_ps(vacc, vhalf); in xnn_f32_hswish_ukernel__avx_x8()
56 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx_x8()
57 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx_x8()
58 vacc = _mm256_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__avx_x8()
59 _mm256_storeu_ps(y, vacc); in xnn_f32_hswish_ukernel__avx_x8()
68 __m256 vacc = _mm256_mul_ps(vx, vsixth); in xnn_f32_hswish_ukernel__avx_x8() local
69 vacc = _mm256_add_ps(vacc, vhalf); in xnn_f32_hswish_ukernel__avx_x8()
70 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx_x8()
71 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx_x8()
[all …]
avx-x16.c
61 __m256 vacc = _mm256_mul_ps(vx, vsixth); in xnn_f32_hswish_ukernel__avx_x16() local
62 vacc = _mm256_add_ps(vacc, vhalf); in xnn_f32_hswish_ukernel__avx_x16()
63 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx_x16()
64 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx_x16()
65 vacc = _mm256_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__avx_x16()
66 _mm256_storeu_ps(y, vacc); in xnn_f32_hswish_ukernel__avx_x16()
75 __m256 vacc = _mm256_mul_ps(vx, vsixth); in xnn_f32_hswish_ukernel__avx_x16() local
76 vacc = _mm256_add_ps(vacc, vhalf); in xnn_f32_hswish_ukernel__avx_x16()
77 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx_x16()
78 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx_x16()
[all …]
avx512f-x16.c
51 __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__avx512f_x16() local
52 vacc = _mm512_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx512f_x16()
53 vacc = _mm512_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx512f_x16()
54 vacc = _mm512_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__avx512f_x16()
55 _mm512_storeu_ps(y, vacc); in xnn_f32_hswish_ukernel__avx512f_x16()
66 __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__avx512f_x16() local
67 vacc = _mm512_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx512f_x16()
68 vacc = _mm512_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx512f_x16()
69 vacc = _mm512_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__avx512f_x16()
70 _mm512_mask_storeu_ps(y, vmask, vacc); in xnn_f32_hswish_ukernel__avx512f_x16()
fma3-x8.c
52 __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__fma3_x8() local
53 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__fma3_x8()
54 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__fma3_x8()
55 vacc = _mm256_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__fma3_x8()
56 _mm256_storeu_ps(y, vacc); in xnn_f32_hswish_ukernel__fma3_x8()
65 __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__fma3_x8() local
66 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__fma3_x8()
67 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__fma3_x8()
68 vacc = _mm256_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__fma3_x8()
71 __m128 vacc_lo = _mm256_castps256_ps128(vacc); in xnn_f32_hswish_ukernel__fma3_x8()
[all …]
fma3-x16.c
58 __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__fma3_x16() local
59 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__fma3_x16()
60 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__fma3_x16()
61 vacc = _mm256_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__fma3_x16()
62 _mm256_storeu_ps(y, vacc); in xnn_f32_hswish_ukernel__fma3_x16()
71 __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__fma3_x16() local
72 vacc = _mm256_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__fma3_x16()
73 vacc = _mm256_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__fma3_x16()
74 vacc = _mm256_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__fma3_x16()
77 __m128 vacc_lo = _mm256_castps256_ps128(vacc); in xnn_f32_hswish_ukernel__fma3_x16()
[all …]
avx512f-x32.c
57 __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__avx512f_x32() local
58 vacc = _mm512_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx512f_x32()
59 vacc = _mm512_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx512f_x32()
60 vacc = _mm512_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__avx512f_x32()
61 _mm512_storeu_ps(y, vacc); in xnn_f32_hswish_ukernel__avx512f_x32()
72 __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf); in xnn_f32_hswish_ukernel__avx512f_x32() local
73 vacc = _mm512_max_ps(vacc, vzero); in xnn_f32_hswish_ukernel__avx512f_x32()
74 vacc = _mm512_min_ps(vacc, vone); in xnn_f32_hswish_ukernel__avx512f_x32()
75 vacc = _mm512_mul_ps(vacc, vx); in xnn_f32_hswish_ukernel__avx512f_x32()
76 _mm512_mask_storeu_ps(y, vmask, vacc); in xnn_f32_hswish_ukernel__avx512f_x32()
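
All of these generated kernels share the same vector body: a fused multiply-add (or mul+add on plain AVX), a clamp to [0, 1], a multiply by the input, and a store; only the tile width (8, 16, or 32 floats) and the remainder handling differ. A minimal sketch of the full-vector loop, assuming AVX with FMA3 (names are illustrative and the remainder path is omitted):

#include <immintrin.h>
#include <stddef.h>

/* Main vector loop in the style of the fma3-x8 kernel: 8 floats per step. */
static void hswish_f32_fma3_sketch(size_t n, const float* x, float* y) {
  const __m256 vsixth = _mm256_set1_ps(1.0f / 6.0f);
  const __m256 vhalf = _mm256_set1_ps(0.5f);
  const __m256 vzero = _mm256_setzero_ps();
  const __m256 vone = _mm256_set1_ps(1.0f);
  for (; n >= 8; n -= 8) {
    const __m256 vx = _mm256_loadu_ps(x);
    x += 8;
    __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
    vacc = _mm256_max_ps(vacc, vzero);
    vacc = _mm256_min_ps(vacc, vone);
    vacc = _mm256_mul_ps(vacc, vx);
    _mm256_storeu_ps(y, vacc);
    y += 8;
  }
  /* Any n < 8 leftover elements are handled by the real kernels with partial
   * loads/stores (e.g. _mm512_mask_storeu_ps in the AVX512 variants). */
}
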
/external/XNNPACK/src/f32-dwconv/
up-avx512.c.in
42 __m512 vacc${ABC[0:16]}p0 = _mm512_load_ps(w);
44 __m512 vacc${ABC[C:C+16]}p0 = _mm512_load_ps(w + ${C});
57 … __m512 vacc${ABC[C:C+16]}p${K} = _mm512_mul_ps(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C:C+16]});
59 vacc${ABC[C:C+16]}p${K % ACCUMULATORS} = _mm512_fmadd_ps(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C:C+1…
64 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
70 vacc${ABC[C:C+16]}p${A} = _mm512_add_ps(vacc${ABC[C:C+16]}p${A}, vacc${ABC[C:C+16]}p${A + ACC_SLIC…
74 __m512 vacc${ABC[C:C+16]} = _mm512_max_ps(vacc${ABC[C:C+16]}p0, vmin);
76 vacc${ABC[C:C+16]} = _mm512_min_ps(vacc${ABC[C:C+16]}, vmax);
78 _mm512_storeu_ps(output, vacc${ABC[0:16]});
80 _mm512_storeu_ps(output + ${C}, vacc${ABC[C:C+16]});
[all …]
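
The depthwise-convolution template above initializes each channel's accumulator from the packed weights (bias first), folds in one fused multiply-add per kernel tap, optionally splits the sum across several accumulators before adding them back together, and clamps the result to [vmin, vmax]. A scalar sketch of that per-pixel flow with a single accumulator; the flat [bias | tap 0 | tap 1 | ...] weight layout here is an assumption for the sketch, not XNNPACK's actual packing:

#include <stddef.h>

/* One output pixel of a depthwise convolution: bias, then one multiply-add
 * per tap and channel, then clamp. */
static void dwconv_pixel_scalar(size_t channels, size_t kernel_size,
                                const float** input, const float* w,
                                float* output, float vmin, float vmax) {
  for (size_t c = 0; c < channels; c++) {
    float vacc = w[c];  /* bias, cf. _mm512_load_ps(w) */
    for (size_t k = 0; k < kernel_size; k++) {
      vacc += input[k][c] * w[channels + k * channels + c];  /* _mm512_fmadd_ps */
    }
    vacc = vacc > vmin ? vacc : vmin;  /* _mm512_max_ps(vacc, vmin) */
    vacc = vacc < vmax ? vacc : vmax;  /* _mm512_min_ps(vacc, vmax) */
    output[c] = vacc;
  }
}
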
/external/XNNPACK/src/f32-vmulcaddc/
neon.c.in
62 float32x4_t vacc${M}x${ABC[C:C+4]} = vld1q_f32(i${M}); i${M} += 4;
67 vacc${M}x${ABC[C:C+4]} = vmulq_f32(vacc${M}x${ABC[C:C+4]}, vscale${ABC[C:C+4]});
75 vacc${M}x${ABC[C:C+4]} = vaddq_f32(vacc${M}x${ABC[C:C+4]}, vbias${ABC[C:C+4]});
79 vacc${M}x${ABC[C:C+4]} = vfmaq_f32(vbias${ABC[C:C+4]}, vscale${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}…
83 vacc${M}x${ABC[C:C+4]} = vmaxq_f32(vacc${M}x${ABC[C:C+4]}, vmin);
87 vacc${M}x${ABC[C:C+4]} = vminq_f32(vacc${M}x${ABC[C:C+4]}, vmax);
91 vst1q_f32(o${M}, vacc${M}x${ABC[C:C+4]}); o${M} += 4;
98 float32x4_t vacc${M}x0123 = vld1q_f32(i${M}); i${M} += 4;
102 vacc${M}x0123 = vmulq_f32(vacc${M}x0123, vscale0123);
108 vacc${M}x0123 = vaddq_f32(vacc${M}x0123, vbias0123);
[all …]
sse.c.in
62 __m128 vacc${M}x${ABC[0:4]} = _mm_loadu_ps(i${M});
64 __m128 vacc${M}x${ABC[C:C+4]} = _mm_loadu_ps(i${M} + ${C});
69 vacc${M}x${ABC[C:C+4]} = _mm_mul_ps(vacc${M}x${ABC[C:C+4]}, vscale${ABC[C:C+4]});
76 vacc${M}x${ABC[C:C+4]} = _mm_add_ps(vacc${M}x${ABC[C:C+4]}, vbias${ABC[C:C+4]});
80 vacc${M}x${ABC[C:C+4]} = _mm_max_ps(vacc${M}x${ABC[C:C+4]}, vmin);
84 vacc${M}x${ABC[C:C+4]} = _mm_min_ps(vacc${M}x${ABC[C:C+4]}, vmax);
87 _mm_storeu_ps(o${M}, vacc${M}x${ABC[0:4]});
89 _mm_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+4]});
99 __m128 vacc${M}x0123 = _mm_loadu_ps(i${M});
103 vacc${M}x0123 = _mm_mul_ps(vacc${M}x0123, vscale0123);
[all …]
psimd.c.in
62 psimd_f32 vacc${M}x${ABC[0:4]} = psimd_load_f32(i${M});
64 psimd_f32 vacc${M}x${ABC[C:C+4]} = psimd_load_f32(i${M} + ${C});
72 vacc${M}x${ABC[C:C+4]} = psimd_qfma_f32(vbias${ABC[C:C+4]}, vscale${ABC[C:C+4]}, vacc${M}x${ABC[C:…
76 vacc${M}x${ABC[C:C+4]} = psimd_max_f32(vacc${M}x${ABC[C:C+4]}, vmin);
80 vacc${M}x${ABC[C:C+4]} = psimd_min_f32(vacc${M}x${ABC[C:C+4]}, vmax);
83 psimd_store_f32(o${M}, vacc${M}x${ABC[0:4]});
85 psimd_store_f32(o${M} + ${C}, vacc${M}x${ABC[C:C+4]});
95 psimd_f32 vacc${M}x0123 = psimd_load_f32(i${M});
101 vacc${M}x0123 = psimd_qfma_f32(vbias0123, vscale0123, vacc${M}x0123);
104 vacc${M}x0123 = psimd_max_f32(vacc${M}x0123, vmin);
[all …]
scalar.c.in
62 float vacc${M}x${ABC[C]} = i${M}[${C}];
70 vacc${M}x${ABC[C]} = vacc${M}x${ABC[C]} * vscale${ABC[C]} + vbias${ABC[C]};
74 vacc${M}x${ABC[C]} = ${MAX_F32}(vacc${M}x${ABC[C]}, vmin);
78 vacc${M}x${ABC[C]} = ${MIN_F32}(vacc${M}x${ABC[C]}, vmax);
82 o${M}[${C}] = vacc${M}x${ABC[C]};
92 float vacc${M} = *i${M}++;
97 vacc${M} = vacc${M} * vscale + vbias;
100 vacc${M} = ${MAX_F32}(vacc${M}, vmin);
103 vacc${M} = ${MIN_F32}(vacc${M}, vmax);
106 *o${M}++ = vacc${M};
[all …]
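
All four f32-vmulcaddc variants implement the same per-channel multiply-add-clamp, y = clamp(x * scale + bias, vmin, vmax), over several rows at once. A scalar sketch for a single row (names are illustrative):

#include <stddef.h>

/* Per-channel multiply-add with output clamping, as in the scalar template. */
static void vmulcaddc_row_scalar(size_t channels, const float* input,
                                 const float* scale, const float* bias,
                                 float* output, float vmin, float vmax) {
  for (size_t c = 0; c < channels; c++) {
    float vacc = input[c] * scale[c] + bias[c];
    vacc = vacc > vmin ? vacc : vmin;  /* ${MAX_F32}(vacc, vmin) */
    vacc = vacc < vmax ? vacc : vmax;  /* ${MIN_F32}(vacc, vmax) */
    output[c] = vacc;
  }
}
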
/external/XNNPACK/src/f32-igemm/
MRx2c4-psimd.c.in
63 psimd_f32 vacc${M}x${N}c4 = vacc0x${N}c4;
89 vacc${M}x${N}c4 = psimd_qfma_f32(vacc${M}x${N}c4, va${M}, vb${N});
106 vacc${M}x${N}c4 = psimd_qfma_f32(vacc${M}x${N}c4, psimd_andmask_f32(vmask${N}, va${M}), vb${N});
112 …nst psimd_f32 vacc${M}x01c2 = psimd_add_f32(psimd_interleave_lo_f32(vacc${M}x0c4, vacc${M}x1c4), p…
115 …psimd_f32 vacc${M}${M+1}x01 = psimd_add_f32(psimd_concat_lo_f32(vacc${M}x01c2, vacc${M+1}x01c2), p…
119 vacc${M}${M+1}x01 = psimd_min_f32(vacc${M}${M+1}x01, vmax);
123 vacc${M}${M+1}x01 = psimd_max_f32(vacc${M}${M+1}x01, vmin);
127 psimd_store2_f32(c${M+1}, psimd_concat_hi_f32(vacc${M}${M+1}x01, vacc${M}${M+1}x01));
129 psimd_store2_f32(c${M}, vacc${M}${M+1}x01);
137 psimd_store1_f32(c${M+1}, psimd_concat_hi_f32(vacc${M}${M+1}x01, vacc${M}${M+1}x01));
[all …]
MRx2c4-sse.c.in
63 __m128 vacc${M}x${N}c4 = vacc0x${N}c4;
89 vacc${M}x${N}c4 = _mm_add_ps(vacc${M}x${N}c4, _mm_mul_ps(va${M}, vb${N}));
105 vacc${M}x${N}c4 = _mm_add_ps(vacc${M}x${N}c4, _mm_mul_ps(_mm_andnot_ps(vmask${N}, va${M}), vb${N})…
111 …const __m128 vacc${M}x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc${M}x0c4, vacc${M}x1c4), _mm_unpackhi_…
114 …__m128 vacc${M}${M+1}x01 = _mm_add_ps(_mm_movelh_ps(vacc${M}x01c2, vacc${M+1}x01c2), _mm_movehl_ps…
118 vacc${M}${M+1}x01 = _mm_min_ps(vacc${M}${M+1}x01, vmax);
122 vacc${M}${M+1}x01 = _mm_max_ps(vacc${M}${M+1}x01, vmin);
126 _mm_storeh_pi((__m64*) c${M+1}, vacc${M}${M+1}x01);
128 _mm_storel_pi((__m64*) c${M}, vacc${M}${M+1}x01);
136 _mm_store_ss(c${M+1}, _mm_movehl_ps(vacc${M}${M+1}x01, vacc${M}${M+1}x01));
[all …]
avx-shuffle4.c.in
64 __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
91 vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}c${L}, vacc${M}x${ABC[N:N+8]});
93 vacc${M}x${ABC[N:N+8]} = _mm_add_ps(vacc${M}x${ABC[N:N+8]}, _mm_mul_ps(va${M}, vb${ABC[N:N+8]}c${L…
116 vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
118 vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
129 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
134 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
138 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
140 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
151 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
[all …]
avx-broadcast.c.in
64 __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
91 vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
93 vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
102 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
107 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
111 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
113 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
124 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
126 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
130 vacc${M}x${ABC[N:N+8]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+8]};
[all …]
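
These IGEMM templates share one micro-kernel shape: row 0's accumulators are initialized from the packed weights, the remaining rows copy them (vacc${M}x${N} = vacc0x${N}), the K loop performs one fused multiply-add per (row, column-tile) pair, and the result is clamped to [vmin, vmax] before being stored. A scalar sketch of that shape; MR, NR, and the bias-then-B packing of w are illustrative assumptions, and the indirection buffer through which the indirect-GEMM variants fetch rows of A is not modelled here:

#include <stddef.h>

#define MR 4  /* rows of A/C handled per call (assumed for the sketch) */
#define NR 2  /* columns of B/C handled per call (assumed for the sketch) */

/* Scalar MRxNR micro-kernel: accumulate over k, then clamp before storing. */
static void gemm_microkernel_scalar(size_t kc, const float* a, size_t a_stride,
                                    const float* w, float* c, size_t c_stride,
                                    float vmin, float vmax) {
  float vacc[MR][NR];
  for (size_t m = 0; m < MR; m++) {
    for (size_t n = 0; n < NR; n++) {
      vacc[m][n] = w[n];  /* bias, duplicated across rows */
    }
  }
  const float* b = w + NR;  /* assumed packing: NR values of B per k step */
  for (size_t k = 0; k < kc; k++) {
    for (size_t m = 0; m < MR; m++) {
      const float va = a[m * a_stride + k];
      for (size_t n = 0; n < NR; n++) {
        vacc[m][n] += va * b[k * NR + n];  /* fmadd in the vector kernels */
      }
    }
  }
  for (size_t m = 0; m < MR; m++) {
    for (size_t n = 0; n < NR; n++) {
      float v = vacc[m][n];
      v = v < vmax ? v : vmax;  /* min(vacc, vmax) */
      v = v > vmin ? v : vmin;  /* max(vacc, vmin) */
      c[m * c_stride + n] = v;
    }
  }
}
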
/external/XNNPACK/src/f32-prelu/
psimd.c.in
70 psimd_f32 vacc${M}x${ABC[C:C+4]} = psimd_mul_f32(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]});
74 vacc${M}x${ABC[C:C+4]} = psimd_signblend_f32(vi${M}x${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}, vi${M}x…
78 vacc${M}x${ABC[C:C+4]} = psimd_max_f32(vacc${M}x${ABC[C:C+4]}, vmin);
82 vacc${M}x${ABC[C:C+4]} = psimd_min_f32(vacc${M}x${ABC[C:C+4]}, vmax);
85 psimd_store_f32(o${M}, vacc${M}x${ABC[0:4]});
87 psimd_store_f32(o${M} + ${C}, vacc${M}x${ABC[C:C+4]});
100 psimd_f32 vacc${M}x0123 = psimd_mul_f32(vi${M}x0123, vw0123);
103 vacc${M}x0123 = psimd_signblend_f32(vi${M}x0123, vacc${M}x0123, vi${M}x0123);
106 vacc${M}x0123 = psimd_max_f32(vacc${M}x0123, vmin);
109 vacc${M}x0123 = psimd_min_f32(vacc${M}x0123, vmax);
[all …]
neon.c.in
66 float32x4_t vacc${M}x${ABC[C:C+4]} = vmulq_f32(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]});
71 vacc${M}x${ABC[C:C+4]} = vbslq_f32(vm${M}x${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+…
75 vacc${M}x${ABC[C:C+4]} = vmaxq_f32(vacc${M}x${ABC[C:C+4]}, vmin);
79 vacc${M}x${ABC[C:C+4]} = vminq_f32(vacc${M}x${ABC[C:C+4]}, vmax);
83 vst1q_f32(o${M}, vacc${M}x${ABC[C:C+4]}); o${M} += 4;
94 float32x4_t vacc${M}x0123 = vmulq_f32(vi${M}x0123, vw0123);
98 vacc${M}x0123 = vbslq_f32(vm${M}x0123, vacc${M}x0123, vi${M}x0123);
101 vacc${M}x0123 = vmaxq_f32(vacc${M}x0123, vmin);
104 vacc${M}x0123 = vminq_f32(vacc${M}x0123, vmax);
107 vst1q_f32(o${M}, vacc${M}x0123); o${M} += 4;
[all …]
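
Both f32-prelu variants compute y = (x < 0 ? x * w : x) per channel and then clamp to [vmin, vmax]; the vbslq_f32 / psimd_signblend_f32 calls select between the scaled value and the original input based on a sign mask of x. A scalar sketch (names are illustrative):

#include <stddef.h>

/* Per-channel PReLU with output clamping. */
static void prelu_row_scalar(size_t channels, const float* input,
                             const float* slope, float* output,
                             float vmin, float vmax) {
  for (size_t c = 0; c < channels; c++) {
    const float vi = input[c];
    float vacc = vi * slope[c];    /* vmulq_f32 / psimd_mul_f32 */
    vacc = vi < 0.0f ? vacc : vi;  /* the blend/select step */
    vacc = vacc > vmin ? vacc : vmin;
    vacc = vacc < vmax ? vacc : vmax;
    output[c] = vacc;
  }
}
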
/external/XNNPACK/src/f32-gemm/
avx-shuffle4.c.in
66 __m256 vacc${M}x${ABC[N:N+8]} = _mm256_load_ps(acc + ${M*NR+N});
73 __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
90 vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}c${L}, vacc${M}x${ABC[N:N+8]});
92 vacc${M}x${ABC[N:N+8]} = _mm_add_ps(vacc${M}x${ABC[N:N+8]}, _mm_mul_ps(va${M}, vb${ABC[N:N+8]}c${L…
115 vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
117 vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
126 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
131 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
135 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
137 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
[all …]
avx-broadcast.c.in
66 __m256 vacc${M}x${ABC[N:N+8]} = _mm256_load_ps(acc + ${M*NR+N});
73 __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
90 vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
92 vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
100 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
105 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
109 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
111 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
124 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
126 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
[all …]
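
The f32-gemm templates mirror the IGEMM ones above: accumulators come either from a previous pass (_mm256_load_ps(acc + ...)) or from the bias, and the broadcast variant folds one A element against 8 packed B values per step. A minimal sketch of that inner step for one row and one 8-wide column tile, assuming FMA3 and a B layout of 8 floats per k step (names are illustrative):

#include <immintrin.h>
#include <stddef.h>

/* One row x one 8-column tile of a broadcast-style GEMM inner loop. */
static __m256 gemm_row_tile_avx(size_t kc, const float* a, const float* b,
                                __m256 vacc /* bias or acc[] on entry */,
                                __m256 vmin, __m256 vmax) {
  for (size_t k = 0; k < kc; k++) {
    const __m256 va = _mm256_broadcast_ss(a + k);  /* broadcast one A value */
    const __m256 vb = _mm256_loadu_ps(b + k * 8);  /* 8 packed B values */
    vacc = _mm256_fmadd_ps(va, vb, vacc);
  }
  vacc = _mm256_min_ps(vacc, vmax);  /* clamp upper bound first, as above */
  vacc = _mm256_max_ps(vacc, vmin);
  return vacc;
}
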
/external/XNNPACK/src/f16-gemm/
neonfp16arith-ld64.c.in
63 float16x8_t vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
76 vacc${M}x${ABC[N:N+8]} = vfmaq_lane_f16(vacc${M}x${ABC[N:N+8]}, vb${ABC[N:N+8]}c${L}, va${M}, ${L}…
83 vacc${M}x${ABC[N:N+8]} = vfmaq_f16(vacc${M}x${ABC[N:N+8]}, va${M}c${L}, vb${ABC[N:N+8]}c${L});
98 vacc${M}x${ABC[N:N+8]} = vfmaq_f16(vacc${M}x${ABC[N:N+8]}, va${M}, vb${ABC[N:N+8]});
107 vacc${M}x${ABC[N:N+8]} = vmulq_f16(vacc${M}x${ABC[N:N+8]}, vscale);
112 vacc${M}x${ABC[N:N+8]} = vminq_f16(vacc${M}x${ABC[N:N+8]}, vmax);
117 vacc${M}x${ABC[N:N+8]} = vmaxq_f16(vacc${M}x${ABC[N:N+8]}, vmin);
121 vst1q_f16(c${M}, vacc${M}x${ABC[0:8]});
123 vst1q_f16(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
137 vst1q_f16(c${M}, vacc${M}x${ABC[N:N+8]}); c${M} += 8;
[all …]
/external/XNNPACK/src/f32-spmm/
neon-blocked.c.in
39 float32x4_t vacc${ABC[0:4]}c${N} = vld1q_dup_f32(w); w += 1;
41 float32x4_t vacc${ABC[M:M+4]}c${N} = vacc${ABC[0:4]}c${N};
60 vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, va${ABC[M:M+4]}, vb);
64 vacc${ABC[M:M+4]}c${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}c${N}, va${ABC[M…
69 float32x4_t vout${ABC[M:M+4]}c${N} = vminq_f32(vacc${ABC[M:M+4]}c${N}, vmax);
86 float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1;
88 float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]};
98 vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, va${ABC[M:M+4]}, vb);
102 float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax);
130 float32x2_t vacc${ABC[0:SUBMR]}c${N} = vld1_dup_f32(w); w += 1;
[all …]
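
The f32-spmm (sparse matrix times dense matrix) template broadcasts a per-output-channel value from the packed weights, folds in one fused multiply-add per stored nonzero, and clamps before storing. A scalar sketch of that flow for a single input column, using a simple CSR-style layout; the index arrays and the bias-then-values weight layout are assumptions for the sketch, not XNNPACK's packed format:

#include <stddef.h>
#include <stdint.h>

/* Sparse weights times one dense input column, with output clamping. */
static void spmm_column_scalar(size_t output_channels, const float* input,
                               const float* weights /* bias, then nonzero values */,
                               const uint32_t* nnz_per_channel,
                               const uint32_t* input_index /* one entry per nonzero */,
                               float* output, float vmin, float vmax) {
  size_t idx = 0;
  const float* w = weights;
  for (size_t n = 0; n < output_channels; n++) {
    float vacc = *w++;  /* cf. vld1q_dup_f32(w) broadcasting the start value */
    for (uint32_t j = 0; j < nnz_per_channel[n]; j++) {
      vacc += (*w++) * input[input_index[idx++]];  /* vfmaq_f32 in the template */
    }
    vacc = vacc < vmax ? vacc : vmax;  /* vminq_f32(vacc, vmax) */
    vacc = vacc > vmin ? vacc : vmin;
    output[n] = vacc;
  }
}
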
/external/XNNPACK/src/f32-ppmm/
neon.c.in
55 float32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
69 vacc${M}x${ABC[N:N+4]} = vfmaq_laneq_f32(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}, va${ABC[M&-4:4+M…
79 vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]}, va${MMMM}, vb${ABC[N:N+4]});
85 vacc${M}x${ABC[N:N+4]} = vmlaq_lane_f32(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}, ${VGET_PART_F32}(…
93 vacc${M}x${ABC[N:N+4]} = vminq_f32(vacc${M}x${ABC[N:N+4]}, vmax);
98 vacc${M}x${ABC[N:N+4]} = vmaxq_f32(vacc${M}x${ABC[N:N+4]}, vmin);
102 vst1q_f32(c${M}, vacc${M}x${ABC[0:4]});
104 vst1q_f32(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});
115 float32x2_t vacc${M}x01 = vget_low_f32(vacc${M}x0123);
120 vst1q_f32(c${M}, vacc${M}x${ABC[N:N+4]}); c${M} += 4;
[all …]
