/external/XNNPACK/src/f32-hswish/ |
D | scalar.c.in |
      39  float vacc${ABC[N]} = vx${ABC[N]} * vsixth + vhalf;
      42  vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, 0.0f);
      45  vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, vone);
      48  vacc${ABC[N]} *= vx${ABC[N]};
      51  y[${N}] = vacc${ABC[N]};
      58  float vacc = vx * vsixth + vhalf;
      59  vacc = ${MAX_F32}(vacc, 0.0f);
      60  vacc = ${MIN_F32}(vacc, vone);
      61  vacc = vacc * vx;
      62  *y++ = vacc;
      [all …]
|
D | avx.c.in |
      42  __m256 vacc${ABC[N:N+8]} = _mm256_fmadd_ps(vx${ABC[N:N+8]}, vsixth, vhalf);
      45  __m256 vacc${ABC[N:N+8]} = _mm256_mul_ps(vx${ABC[N:N+8]}, vsixth);
      48  vacc${ABC[N:N+8]} = _mm256_add_ps(vacc${ABC[N:N+8]}, vhalf);
      51  vacc${ABC[N:N+8]} = _mm256_max_ps(vacc${ABC[N:N+8]}, vzero);
      54  vacc${ABC[N:N+8]} = _mm256_min_ps(vacc${ABC[N:N+8]}, vone);
      57  vacc${ABC[N:N+8]} = _mm256_mul_ps(vacc${ABC[N:N+8]}, vx${ABC[N:N+8]});
      59  _mm256_storeu_ps(y, vacc${ABC[0:8]});
      61  _mm256_storeu_ps(y + ${N}, vacc${ABC[N:N+8]});
      69  __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
      71  __m256 vacc = _mm256_mul_ps(vx, vsixth);
      [all …]
|
D | avx512f.c.in |
      39  __m512 vacc${ABC[N:N+16]} = _mm512_fmadd_ps(vx${ABC[N:N+16]}, vsixth, vhalf);
      42  vacc${ABC[N:N+16]} = _mm512_max_ps(vacc${ABC[N:N+16]}, vzero);
      45  vacc${ABC[N:N+16]} = _mm512_min_ps(vacc${ABC[N:N+16]}, vone);
      48  vacc${ABC[N:N+16]} = _mm512_mul_ps(vacc${ABC[N:N+16]}, vx${ABC[N:N+16]});
      50  _mm512_storeu_ps(y, vacc${ABC[0:16]});
      52  _mm512_storeu_ps(y + ${N}, vacc${ABC[N:N+16]});
      59  __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
      60  vacc = _mm512_max_ps(vacc, vzero);
      61  vacc = _mm512_min_ps(vacc, vone);
      62  vacc = _mm512_mul_ps(vacc, vx);
      [all …]
|
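All of the f32-hswish kernels indexed above evaluate the same formula, y = x * clamp(x * (1/6) + 1/2, 0, 1); they differ only in vector width and in whether a fused multiply-add is available. A minimal scalar sketch of that computation follows; the helper name and signature are illustrative, not the XNNPACK ukernel API.

#include <stddef.h>

// Reference hard-swish: y[i] = x[i] * clamp(x[i]*(1/6) + 1/2, 0, 1).
// Hypothetical helper, not an XNNPACK entry point.
static void hswish_f32_ref(size_t n, const float* x, float* y) {
  const float vsixth = 0x1.555556p-3f;  // ~1/6, like the vsixth constant in the kernels
  const float vhalf = 0.5f;
  const float vone = 1.0f;
  for (size_t i = 0; i < n; i++) {
    float vacc = x[i] * vsixth + vhalf;
    vacc = vacc > 0.0f ? vacc : 0.0f;   // MAX_F32(vacc, 0.0f)
    vacc = vacc < vone ? vacc : vone;   // MIN_F32(vacc, vone)
    y[i] = vacc * x[i];
  }
}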
/external/XNNPACK/src/f32-hswish/gen/ |
D | avx-x8.c | (in xnn_f32_hswish_ukernel__avx_x8)
      54  __m256 vacc = _mm256_mul_ps(vx, vsixth);
      55  vacc = _mm256_add_ps(vacc, vhalf);
      56  vacc = _mm256_max_ps(vacc, vzero);
      57  vacc = _mm256_min_ps(vacc, vone);
      58  vacc = _mm256_mul_ps(vacc, vx);
      59  _mm256_storeu_ps(y, vacc);
      68  __m256 vacc = _mm256_mul_ps(vx, vsixth);
      69  vacc = _mm256_add_ps(vacc, vhalf);
      70  vacc = _mm256_max_ps(vacc, vzero);
      71  vacc = _mm256_min_ps(vacc, vone);
      [all …]
|
D | avx-x16.c | (in xnn_f32_hswish_ukernel__avx_x16)
      61  __m256 vacc = _mm256_mul_ps(vx, vsixth);
      62  vacc = _mm256_add_ps(vacc, vhalf);
      63  vacc = _mm256_max_ps(vacc, vzero);
      64  vacc = _mm256_min_ps(vacc, vone);
      65  vacc = _mm256_mul_ps(vacc, vx);
      66  _mm256_storeu_ps(y, vacc);
      75  __m256 vacc = _mm256_mul_ps(vx, vsixth);
      76  vacc = _mm256_add_ps(vacc, vhalf);
      77  vacc = _mm256_max_ps(vacc, vzero);
      78  vacc = _mm256_min_ps(vacc, vone);
      [all …]
|
D | avx512f-x16.c | (in xnn_f32_hswish_ukernel__avx512f_x16)
      51  __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
      52  vacc = _mm512_max_ps(vacc, vzero);
      53  vacc = _mm512_min_ps(vacc, vone);
      54  vacc = _mm512_mul_ps(vacc, vx);
      55  _mm512_storeu_ps(y, vacc);
      66  __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
      67  vacc = _mm512_max_ps(vacc, vzero);
      68  vacc = _mm512_min_ps(vacc, vone);
      69  vacc = _mm512_mul_ps(vacc, vx);
      70  _mm512_mask_storeu_ps(y, vmask, vacc);
|
D | fma3-x8.c | (in xnn_f32_hswish_ukernel__fma3_x8)
      52  __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
      53  vacc = _mm256_max_ps(vacc, vzero);
      54  vacc = _mm256_min_ps(vacc, vone);
      55  vacc = _mm256_mul_ps(vacc, vx);
      56  _mm256_storeu_ps(y, vacc);
      65  __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
      66  vacc = _mm256_max_ps(vacc, vzero);
      67  vacc = _mm256_min_ps(vacc, vone);
      68  vacc = _mm256_mul_ps(vacc, vx);
      71  __m128 vacc_lo = _mm256_castps256_ps128(vacc);
      [all …]
|
D | fma3-x16.c | (in xnn_f32_hswish_ukernel__fma3_x16)
      58  __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
      59  vacc = _mm256_max_ps(vacc, vzero);
      60  vacc = _mm256_min_ps(vacc, vone);
      61  vacc = _mm256_mul_ps(vacc, vx);
      62  _mm256_storeu_ps(y, vacc);
      71  __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
      72  vacc = _mm256_max_ps(vacc, vzero);
      73  vacc = _mm256_min_ps(vacc, vone);
      74  vacc = _mm256_mul_ps(vacc, vx);
      77  __m128 vacc_lo = _mm256_castps256_ps128(vacc);
      [all …]
|
D | avx512f-x32.c | (in xnn_f32_hswish_ukernel__avx512f_x32)
      57  __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
      58  vacc = _mm512_max_ps(vacc, vzero);
      59  vacc = _mm512_min_ps(vacc, vone);
      60  vacc = _mm512_mul_ps(vacc, vx);
      61  _mm512_storeu_ps(y, vacc);
      72  __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
      73  vacc = _mm512_max_ps(vacc, vzero);
      74  vacc = _mm512_min_ps(vacc, vone);
      75  vacc = _mm512_mul_ps(vacc, vx);
      76  _mm512_mask_storeu_ps(y, vmask, vacc);
|
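The generated avx512f-x16.c and avx512f-x32.c kernels above finish a batch whose length is not a multiple of the vector width with a predicated store (_mm512_mask_storeu_ps) rather than a scalar loop. A hedged sketch of that remainder idiom, assuming AVX-512F; the helper name is illustrative.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

// Sketch of the masked-tail idiom used by the avx512f hswish kernels above:
// process the last n < 16 floats with a predicated load/store.
static void hswish_f32_avx512_tail(size_t n /* 1..15 */, const float* x, float* y) {
  const __m512 vsixth = _mm512_set1_ps(0x1.555556p-3f);
  const __m512 vhalf = _mm512_set1_ps(0.5f);
  const __m512 vone = _mm512_set1_ps(1.0f);
  const __m512 vzero = _mm512_setzero_ps();
  // Mask with the low n bits set, as in the generated kernels.
  const __mmask16 vmask = _cvtu32_mask16((uint32_t) ((UINT32_C(1) << n) - UINT32_C(1)));
  const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
  __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
  vacc = _mm512_max_ps(vacc, vzero);
  vacc = _mm512_min_ps(vacc, vone);
  vacc = _mm512_mul_ps(vacc, vx);
  _mm512_mask_storeu_ps(y, vmask, vacc);
}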
/external/XNNPACK/src/f32-dwconv/ |
D | up-avx512.c.in |
      42  __m512 vacc${ABC[0:16]}p0 = _mm512_load_ps(w);
      44  __m512 vacc${ABC[C:C+16]}p0 = _mm512_load_ps(w + ${C});
      57  … __m512 vacc${ABC[C:C+16]}p${K} = _mm512_mul_ps(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C:C+16]});
      59  …vacc${ABC[C:C+16]}p${K % ACCUMULATORS} = _mm512_fmadd_ps(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C:C+1…
      64  // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
      70  …vacc${ABC[C:C+16]}p${A} = _mm512_add_ps(vacc${ABC[C:C+16]}p${A}, vacc${ABC[C:C+16]}p${A + ACC_SLIC…
      74  __m512 vacc${ABC[C:C+16]} = _mm512_max_ps(vacc${ABC[C:C+16]}p0, vmin);
      76  vacc${ABC[C:C+16]} = _mm512_min_ps(vacc${ABC[C:C+16]}, vmax);
      78  _mm512_storeu_ps(output, vacc${ABC[0:16]});
      80  _mm512_storeu_ps(output + ${C}, vacc${ABC[C:C+16]});
      [all …]
|
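The up-avx512 depthwise-convolution template accumulates, per channel, a bias plus one product per kernel tap (optionally split across several accumulators that are then summed back into p0), and clamps the result to [vmin, vmax]. A scalar sketch of one output pixel, assuming a packed "bias, then per-tap weights" block; the names and layout are illustrative, not the XNNPACK microkernel signature.

#include <stddef.h>

// Scalar sketch of one output pixel of a depthwise convolution in the
// "unipass" layout used above: w points at a packed block of
// [bias[channels], tap0 weights[channels], tap1 weights[channels], ...],
// and input[k] points at the k-th tap's input row for this pixel.
static void dwconv_unipass_ref(size_t channels, size_t kernel_size,
                               const float* w, const float** input, float* output,
                               float vmin, float vmax) {
  for (size_t c = 0; c < channels; c++) {
    float vacc = w[c];                       // bias, like _mm512_load_ps(w)
    for (size_t k = 0; k < kernel_size; k++) {
      const float vi = input[k][c];
      const float vk = w[channels * (k + 1) + c];
      vacc += vi * vk;                       // fmadd in the vector kernel
    }
    vacc = vacc > vmin ? vacc : vmin;        // _mm512_max_ps(..., vmin)
    vacc = vacc < vmax ? vacc : vmax;        // _mm512_min_ps(..., vmax)
    output[c] = vacc;
  }
}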
/external/XNNPACK/src/f32-vmulcaddc/ |
D | neon.c.in |
      62  float32x4_t vacc${M}x${ABC[C:C+4]} = vld1q_f32(i${M}); i${M} += 4;
      67  vacc${M}x${ABC[C:C+4]} = vmulq_f32(vacc${M}x${ABC[C:C+4]}, vscale${ABC[C:C+4]});
      75  vacc${M}x${ABC[C:C+4]} = vaddq_f32(vacc${M}x${ABC[C:C+4]}, vbias${ABC[C:C+4]});
      79  …vacc${M}x${ABC[C:C+4]} = vfmaq_f32(vbias${ABC[C:C+4]}, vscale${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}…
      83  vacc${M}x${ABC[C:C+4]} = vmaxq_f32(vacc${M}x${ABC[C:C+4]}, vmin);
      87  vacc${M}x${ABC[C:C+4]} = vminq_f32(vacc${M}x${ABC[C:C+4]}, vmax);
      91  vst1q_f32(o${M}, vacc${M}x${ABC[C:C+4]}); o${M} += 4;
      98  float32x4_t vacc${M}x0123 = vld1q_f32(i${M}); i${M} += 4;
      102 vacc${M}x0123 = vmulq_f32(vacc${M}x0123, vscale0123);
      108 vacc${M}x0123 = vaddq_f32(vacc${M}x0123, vbias0123);
      [all …]
|
D | sse.c.in |
      62  __m128 vacc${M}x${ABC[0:4]} = _mm_loadu_ps(i${M});
      64  __m128 vacc${M}x${ABC[C:C+4]} = _mm_loadu_ps(i${M} + ${C});
      69  vacc${M}x${ABC[C:C+4]} = _mm_mul_ps(vacc${M}x${ABC[C:C+4]}, vscale${ABC[C:C+4]});
      76  vacc${M}x${ABC[C:C+4]} = _mm_add_ps(vacc${M}x${ABC[C:C+4]}, vbias${ABC[C:C+4]});
      80  vacc${M}x${ABC[C:C+4]} = _mm_max_ps(vacc${M}x${ABC[C:C+4]}, vmin);
      84  vacc${M}x${ABC[C:C+4]} = _mm_min_ps(vacc${M}x${ABC[C:C+4]}, vmax);
      87  _mm_storeu_ps(o${M}, vacc${M}x${ABC[0:4]});
      89  _mm_storeu_ps(o${M} + ${C}, vacc${M}x${ABC[C:C+4]});
      99  __m128 vacc${M}x0123 = _mm_loadu_ps(i${M});
      103 vacc${M}x0123 = _mm_mul_ps(vacc${M}x0123, vscale0123);
      [all …]
|
D | psimd.c.in |
      62  psimd_f32 vacc${M}x${ABC[0:4]} = psimd_load_f32(i${M});
      64  psimd_f32 vacc${M}x${ABC[C:C+4]} = psimd_load_f32(i${M} + ${C});
      72  …vacc${M}x${ABC[C:C+4]} = psimd_qfma_f32(vbias${ABC[C:C+4]}, vscale${ABC[C:C+4]}, vacc${M}x${ABC[C:…
      76  vacc${M}x${ABC[C:C+4]} = psimd_max_f32(vacc${M}x${ABC[C:C+4]}, vmin);
      80  vacc${M}x${ABC[C:C+4]} = psimd_min_f32(vacc${M}x${ABC[C:C+4]}, vmax);
      83  psimd_store_f32(o${M}, vacc${M}x${ABC[0:4]});
      85  psimd_store_f32(o${M} + ${C}, vacc${M}x${ABC[C:C+4]});
      95  psimd_f32 vacc${M}x0123 = psimd_load_f32(i${M});
      101 vacc${M}x0123 = psimd_qfma_f32(vbias0123, vscale0123, vacc${M}x0123);
      104 vacc${M}x0123 = psimd_max_f32(vacc${M}x0123, vmin);
      [all …]
|
D | scalar.c.in |
      62  float vacc${M}x${ABC[C]} = i${M}[${C}];
      70  vacc${M}x${ABC[C]} = vacc${M}x${ABC[C]} * vscale${ABC[C]} + vbias${ABC[C]};
      74  vacc${M}x${ABC[C]} = ${MAX_F32}(vacc${M}x${ABC[C]}, vmin);
      78  vacc${M}x${ABC[C]} = ${MIN_F32}(vacc${M}x${ABC[C]}, vmax);
      82  o${M}[${C}] = vacc${M}x${ABC[C]};
      92  float vacc${M} = *i${M}++;
      97  vacc${M} = vacc${M} * vscale + vbias;
      100 vacc${M} = ${MAX_F32}(vacc${M}, vmin);
      103 vacc${M} = ${MIN_F32}(vacc${M}, vmax);
      106 *o${M}++ = vacc${M};
      [all …]
|
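Every vmulcaddc variant above computes y = clamp(x * scale + bias, vmin, vmax) with per-channel scale and bias, applied row by row. A scalar sketch with an illustrative signature, not the XNNPACK microkernel API:

#include <stddef.h>

// Scalar sketch of the vmulcaddc operation shown above:
// y[m][c] = clamp(x[m][c] * scale[c] + bias[c], vmin, vmax).
static void vmulcaddc_ref(size_t rows, size_t channels,
                          const float* x, const float* scale, const float* bias,
                          float* y, float vmin, float vmax) {
  for (size_t m = 0; m < rows; m++) {
    for (size_t c = 0; c < channels; c++) {
      float vacc = x[m * channels + c] * scale[c] + bias[c];
      vacc = vacc > vmin ? vacc : vmin;
      vacc = vacc < vmax ? vacc : vmax;
      y[m * channels + c] = vacc;
    }
  }
}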
/external/XNNPACK/src/f32-igemm/ |
D | MRx2c4-psimd.c.in |
      63  psimd_f32 vacc${M}x${N}c4 = vacc0x${N}c4;
      89  vacc${M}x${N}c4 = psimd_qfma_f32(vacc${M}x${N}c4, va${M}, vb${N});
      106 … vacc${M}x${N}c4 = psimd_qfma_f32(vacc${M}x${N}c4, psimd_andmask_f32(vmask${N}, va${M}), vb${N});
      112 …nst psimd_f32 vacc${M}x01c2 = psimd_add_f32(psimd_interleave_lo_f32(vacc${M}x0c4, vacc${M}x1c4), p…
      115 …psimd_f32 vacc${M}${M+1}x01 = psimd_add_f32(psimd_concat_lo_f32(vacc${M}x01c2, vacc${M+1}x01c2), p…
      119 vacc${M}${M+1}x01 = psimd_min_f32(vacc${M}${M+1}x01, vmax);
      123 vacc${M}${M+1}x01 = psimd_max_f32(vacc${M}${M+1}x01, vmin);
      127 psimd_store2_f32(c${M+1}, psimd_concat_hi_f32(vacc${M}${M+1}x01, vacc${M}${M+1}x01));
      129 psimd_store2_f32(c${M}, vacc${M}${M+1}x01);
      137 psimd_store1_f32(c${M+1}, psimd_concat_hi_f32(vacc${M}${M+1}x01, vacc${M}${M+1}x01));
      [all …]
|
D | MRx2c4-sse.c.in |
      63  __m128 vacc${M}x${N}c4 = vacc0x${N}c4;
      89  vacc${M}x${N}c4 = _mm_add_ps(vacc${M}x${N}c4, _mm_mul_ps(va${M}, vb${N}));
      105 …vacc${M}x${N}c4 = _mm_add_ps(vacc${M}x${N}c4, _mm_mul_ps(_mm_andnot_ps(vmask${N}, va${M}), vb${N})…
      111 …const __m128 vacc${M}x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc${M}x0c4, vacc${M}x1c4), _mm_unpackhi_…
      114 …__m128 vacc${M}${M+1}x01 = _mm_add_ps(_mm_movelh_ps(vacc${M}x01c2, vacc${M+1}x01c2), _mm_movehl_ps…
      118 vacc${M}${M+1}x01 = _mm_min_ps(vacc${M}${M+1}x01, vmax);
      122 vacc${M}${M+1}x01 = _mm_max_ps(vacc${M}${M+1}x01, vmin);
      126 _mm_storeh_pi((__m64*) c${M+1}, vacc${M}${M+1}x01);
      128 _mm_storel_pi((__m64*) c${M}, vacc${M}${M+1}x01);
      136 _mm_store_ss(c${M+1}, _mm_movehl_ps(vacc${M}${M+1}x01, vacc${M}${M+1}x01));
      [all …]
|
D | avx-shuffle4.c.in |
      64  __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
      91  … vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}c${L}, vacc${M}x${ABC[N:N+8]});
      93  …vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8]}c${L…
      116 … vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
      118 …vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
      129 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
      134 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
      138 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
      140 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
      151 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
      [all …]
|
D | avx-broadcast.c.in |
      64  __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
      91  … vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
      93  …vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
      102 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
      107 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
      111 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
      113 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
      124 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
      126 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
      130 vacc${M}x${ABC[N:N+8]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+8]};
      [all …]
|
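The f32-igemm microkernels above are indirect GEMMs: each row of A arrives as a list of pointers (the indirection buffer), so convolution can reuse the GEMM kernel without an im2col copy, and the result is clamped to [vmin, vmax]. A scalar sketch under an assumed packing layout; the names and signature are illustrative.

#include <stddef.h>

// Scalar sketch of an "indirect GEMM" accumulation: each row of A is supplied
// as ks pointers of kc elements each, and w is assumed packed as
// [bias[nc], then ks*kc rows of nc weights].
static void igemm_ref(size_t mr, size_t nc, size_t kc, size_t ks,
                      const float** a,      // mr * ks row pointers (indirection buffer)
                      const float* w,       // packed bias + weights (assumed layout)
                      float* c, size_t cm_stride,
                      float vmin, float vmax) {
  for (size_t m = 0; m < mr; m++) {
    for (size_t n = 0; n < nc; n++) {
      float vacc = w[n];                    // bias init, like _mm256_load_ps(w)
      const float* wk = w + nc;
      for (size_t p = 0; p < ks; p++) {
        const float* am = a[p * mr + m];    // p-th indirect pointer for row m
        for (size_t k = 0; k < kc; k++) {
          vacc += am[k] * wk[k * nc + n];   // fmadd in the vector kernels
        }
        wk += kc * nc;
      }
      vacc = vacc < vmax ? vacc : vmax;     // min with vmax
      vacc = vacc > vmin ? vacc : vmin;     // max with vmin
      c[m * cm_stride + n] = vacc;
    }
  }
}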
/external/XNNPACK/src/f32-prelu/ |
D | psimd.c.in |
      70  psimd_f32 vacc${M}x${ABC[C:C+4]} = psimd_mul_f32(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]});
      74  …vacc${M}x${ABC[C:C+4]} = psimd_signblend_f32(vi${M}x${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}, vi${M}x…
      78  vacc${M}x${ABC[C:C+4]} = psimd_max_f32(vacc${M}x${ABC[C:C+4]}, vmin);
      82  vacc${M}x${ABC[C:C+4]} = psimd_min_f32(vacc${M}x${ABC[C:C+4]}, vmax);
      85  psimd_store_f32(o${M}, vacc${M}x${ABC[0:4]});
      87  psimd_store_f32(o${M} + ${C}, vacc${M}x${ABC[C:C+4]});
      100 psimd_f32 vacc${M}x0123 = psimd_mul_f32(vi${M}x0123, vw0123);
      103 vacc${M}x0123 = psimd_signblend_f32(vi${M}x0123, vacc${M}x0123, vi${M}x0123);
      106 vacc${M}x0123 = psimd_max_f32(vacc${M}x0123, vmin);
      109 vacc${M}x0123 = psimd_min_f32(vacc${M}x0123, vmax);
      [all …]
|
D | neon.c.in |
      66  float32x4_t vacc${M}x${ABC[C:C+4]} = vmulq_f32(vi${M}x${ABC[C:C+4]}, vw${ABC[C:C+4]});
      71  …vacc${M}x${ABC[C:C+4]} = vbslq_f32(vm${M}x${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}, vi${M}x${ABC[C:C+…
      75  vacc${M}x${ABC[C:C+4]} = vmaxq_f32(vacc${M}x${ABC[C:C+4]}, vmin);
      79  vacc${M}x${ABC[C:C+4]} = vminq_f32(vacc${M}x${ABC[C:C+4]}, vmax);
      83  vst1q_f32(o${M}, vacc${M}x${ABC[C:C+4]}); o${M} += 4;
      94  float32x4_t vacc${M}x0123 = vmulq_f32(vi${M}x0123, vw0123);
      98  vacc${M}x0123 = vbslq_f32(vm${M}x0123, vacc${M}x0123, vi${M}x0123);
      101 vacc${M}x0123 = vmaxq_f32(vacc${M}x0123, vmin);
      104 vacc${M}x0123 = vminq_f32(vacc${M}x0123, vmax);
      107 vst1q_f32(o${M}, vacc${M}x0123); o${M} += 4;
      [all …]
|
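Both PReLU kernels above compute the product x * w once and then select between it and x based on the sign of x (psimd_signblend_f32 / vbslq_f32), followed by the usual clamp. A scalar sketch with an illustrative signature:

#include <stddef.h>

// Scalar sketch of PReLU as vectorized above: negative inputs are scaled by a
// per-channel weight, non-negative inputs pass through, then the result is clamped.
static void prelu_ref(size_t rows, size_t channels,
                      const float* x, const float* w, float* y,
                      float vmin, float vmax) {
  for (size_t m = 0; m < rows; m++) {
    for (size_t c = 0; c < channels; c++) {
      const float vi = x[m * channels + c];
      float vacc = vi < 0.0f ? vi * w[c] : vi;   // sign-based select
      vacc = vacc > vmin ? vacc : vmin;
      vacc = vacc < vmax ? vacc : vmax;
      y[m * channels + c] = vacc;
    }
  }
}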
/external/XNNPACK/src/f32-gemm/ |
D | avx-shuffle4.c.in |
      66  __m256 vacc${M}x${ABC[N:N+8]} = _mm256_load_ps(acc + ${M*NR+N});
      73  __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
      90  … vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}c${L}, vacc${M}x${ABC[N:N+8]});
      92  …vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8]}c${L…
      115 … vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
      117 …vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
      126 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
      131 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
      135 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
      137 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
      [all …]
|
D | avx-broadcast.c.in |
      66  __m256 vacc${M}x${ABC[N:N+8]} = _mm256_load_ps(acc + ${M*NR+N});
      73  __m256 vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
      90  … vacc${M}x${ABC[N:N+8]} = _mm256_fmadd_ps(va${M}, vb${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]});
      92  …vacc${M}x${ABC[N:N+8]} = _mm256_add_ps(vacc${M}x${ABC[N:N+8]}, _mm256_mul_ps(va${M}, vb${ABC[N:N+8…
      100 vacc${M}x${ABC[N:N+8]} = _mm256_min_ps(vacc${M}x${ABC[N:N+8]}, vmax);
      105 vacc${M}x${ABC[N:N+8]} = _mm256_max_ps(vacc${M}x${ABC[N:N+8]}, vmin);
      109 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
      111 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
      124 _mm256_storeu_ps(c${M}, vacc${M}x${ABC[0:8]});
      126 _mm256_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
      [all …]
|
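The avx-broadcast GEMM template accumulates an output tile by broadcasting a single element of A per k step and fusing it with an 8-wide slice of packed B; the avx-shuffle4 variant instead rotates four broadcast lanes. A hedged sketch of a 1x8 tile of the broadcast pattern, assuming FMA3 and an assumed "bias, then kc rows of 8" packing of w; the helper name is illustrative.

#include <immintrin.h>
#include <stddef.h>

// One row of the broadcast GEMM microkernel pattern: per k, splat a[k] and
// fuse-multiply-add it with 8 packed B values, then clamp and store.
static void gemm_1x8_broadcast_sketch(size_t kc, const float* a, const float* w,
                                      float* c, float vmin, float vmax) {
  __m256 vacc0x01234567 = _mm256_loadu_ps(w);          // bias
  w += 8;
  for (size_t k = 0; k < kc; k++) {
    const __m256 va0 = _mm256_broadcast_ss(a + k);     // splat a[0][k]
    const __m256 vb01234567 = _mm256_loadu_ps(w);
    vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
    w += 8;
  }
  vacc0x01234567 = _mm256_min_ps(vacc0x01234567, _mm256_set1_ps(vmax));
  vacc0x01234567 = _mm256_max_ps(vacc0x01234567, _mm256_set1_ps(vmin));
  _mm256_storeu_ps(c, vacc0x01234567);
}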
/external/XNNPACK/src/f16-gemm/ |
D | neonfp16arith-ld64.c.in |
      63  float16x8_t vacc${M}x${ABC[N:N+8]} = vacc0x${ABC[N:N+8]};
      76  …vacc${M}x${ABC[N:N+8]} = vfmaq_lane_f16(vacc${M}x${ABC[N:N+8]}, vb${ABC[N:N+8]}c${L}, va${M}, ${L}…
      83  … vacc${M}x${ABC[N:N+8]} = vfmaq_f16(vacc${M}x${ABC[N:N+8]}, va${M}c${L}, vb${ABC[N:N+8]}c${L});
      98  vacc${M}x${ABC[N:N+8]} = vfmaq_f16(vacc${M}x${ABC[N:N+8]}, va${M}, vb${ABC[N:N+8]});
      107 vacc${M}x${ABC[N:N+8]} = vmulq_f16(vacc${M}x${ABC[N:N+8]}, vscale);
      112 vacc${M}x${ABC[N:N+8]} = vminq_f16(vacc${M}x${ABC[N:N+8]}, vmax);
      117 vacc${M}x${ABC[N:N+8]} = vmaxq_f16(vacc${M}x${ABC[N:N+8]}, vmin);
      121 vst1q_f16(c${M}, vacc${M}x${ABC[0:8]});
      123 vst1q_f16(c${M} + ${N}, vacc${M}x${ABC[N:N+8]});
      137 vst1q_f16(c${M}, vacc${M}x${ABC[N:N+8]}); c${M} += 8;
      [all …]
|
/external/XNNPACK/src/f32-spmm/ |
D | neon-blocked.c.in |
      39  float32x4_t vacc${ABC[0:4]}c${N} = vld1q_dup_f32(w); w += 1;
      41  float32x4_t vacc${ABC[M:M+4]}c${N} = vacc${ABC[0:4]}c${N};
      60  vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, va${ABC[M:M+4]}, vb);
      64  …vacc${ABC[M:M+4]}c${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}c${N}, va${ABC[M…
      69  float32x4_t vout${ABC[M:M+4]}c${N} = vminq_f32(vacc${ABC[M:M+4]}c${N}, vmax);
      86  float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1;
      88  float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]};
      98  vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, va${ABC[M:M+4]}, vb);
      102 float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax);
      130 float32x2_t vacc${ABC[0:SUBMR]}c${N} = vld1_dup_f32(w); w += 1;
      [all …]
|
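The spmm kernels above keep a dense column of activations in registers and stream over one output channel's non-zero weights, using a per-non-zero pointer increment to hop between the input rows that are actually needed. A scalar sketch; the names and the exact increment encoding (bytes in the real kernels, elements here) are assumptions.

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the sparse GEMM (spmm) accumulation vectorized above: each
// output channel starts from a broadcast bias, then its non-zero weights are
// applied to whichever input rows the dmap increments point at.
static void spmm_ref(size_t batch, size_t output_channels,
                     const float* input,            // dense activations
                     const float* w,                // bias + nnz values, per channel
                     const int32_t* dmap,           // input-pointer increments, one per nnz
                     const uint32_t* nnz_per_channel,
                     float* output,                 // [output_channels][batch]
                     float vmin, float vmax) {
  const float* in = input;
  for (size_t n = 0; n < output_channels; n++) {
    const float vbias = *w++;                        // vld1q_dup_f32(w) in the kernel
    for (size_t m = 0; m < batch; m++) {
      output[n * batch + m] = vbias;
    }
    for (uint32_t nnz = nnz_per_channel[n]; nnz != 0; nnz--) {
      const float vw = *w++;
      for (size_t m = 0; m < batch; m++) {
        output[n * batch + m] += in[m] * vw;         // vfmaq_f32 in the kernel
      }
      in += *dmap++;                                 // jump to the next needed input row
    }
    for (size_t m = 0; m < batch; m++) {
      float vacc = output[n * batch + m];
      vacc = vacc < vmax ? vacc : vmax;
      vacc = vacc > vmin ? vacc : vmin;
      output[n * batch + m] = vacc;
    }
  }
}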
/external/XNNPACK/src/f32-ppmm/ |
D | neon.c.in |
      55  float32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
      69  …vacc${M}x${ABC[N:N+4]} = vfmaq_laneq_f32(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}, va${ABC[M&-4:4+M…
      79  … vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]}, va${MMMM}, vb${ABC[N:N+4]});
      85  …vacc${M}x${ABC[N:N+4]} = vmlaq_lane_f32(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}, ${VGET_PART_F32}(…
      93  vacc${M}x${ABC[N:N+4]} = vminq_f32(vacc${M}x${ABC[N:N+4]}, vmax);
      98  vacc${M}x${ABC[N:N+4]} = vmaxq_f32(vacc${M}x${ABC[N:N+4]}, vmin);
      102 vst1q_f32(c${M}, vacc${M}x${ABC[0:4]});
      104 vst1q_f32(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});
      115 float32x2_t vacc${M}x01 = vget_low_f32(vacc${M}x0123);
      120 vst1q_f32(c${M}, vacc${M}x${ABC[N:N+4]}); c${M} += 4;
      [all …]
|
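The f32-ppmm microkernels differ from the plain GEMM ones in that A is pre-packed so the MR values for a given k sit contiguously, which is what allows the single vector load feeding vfmaq_laneq_f32 above. A scalar sketch under an assumed packing layout; names are illustrative.

#include <stddef.h>

// Scalar sketch of the PPMM ("pre-packed" GEMM) inner loop: a_packed holds kc
// blocks of mr contiguous A values, and w is assumed packed as
// [bias[nr], then kc rows of nr weights].
static void ppmm_ref(size_t mr, size_t nr, size_t kc,
                     const float* a_packed,   // kc blocks of mr contiguous values
                     const float* w,          // packed bias + weights (assumed layout)
                     float* c, size_t cm_stride,
                     float vmin, float vmax) {
  for (size_t m = 0; m < mr; m++) {
    for (size_t n = 0; n < nr; n++) {
      float vacc = w[n];                              // bias init
      for (size_t k = 0; k < kc; k++) {
        vacc += a_packed[k * mr + m] * w[nr + k * nr + n];
      }
      vacc = vacc < vmax ? vacc : vmax;               // vminq_f32(..., vmax)
      vacc = vacc > vmin ? vacc : vmin;               // vmaxq_f32(..., vmin)
      c[m * cm_stride + n] = vacc;
    }
  }
}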