/external/XNNPACK/src/f32-ibilinear-chw/gen/

D | wasmsimd-p4.c |
     60  const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4() local
     76  const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4()
     77  const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4()
     92  const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4() local
    110  const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4()
    111  const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4()
    134  const v128_t valphav = wasm_v32x4_load_splat(w + 1);  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4() local
    148  const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4()

D | neonfma-p4.c |
     59  const float32x4_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4() local
     77  const float32x4_t vl = vfmaq_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4()
     78  const float32x4_t vr = vfmaq_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4()
     93  const float32x2_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4() local
    117  const float32x2_t vl = vfma_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4()
    118  const float32x2_t vr = vfma_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4()
    141  const float32x2_t valphav = vld1_dup_f32(w + 1);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4() local
    155  const float32x2_t vlr = vfma_f32(vtltr, vldrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p4()

D | neon-p4.c |
     59  const float32x4_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neon_p4() local
     77  const float32x4_t vl = vmlaq_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p4()
     78  const float32x4_t vr = vmlaq_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p4()
     93  const float32x2_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neon_p4() local
    117  const float32x2_t vl = vmla_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p4()
    118  const float32x2_t vr = vmla_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p4()
    141  const float32x2_t valphav = vld1_dup_f32(w + 1);  in xnn_f32_ibilinear_chw_ukernel__neon_p4() local
    155  const float32x2_t vlr = vmla_f32(vtltr, vldrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p4()

D | wasmsimd-p8.c |
    146  const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8() local
    162  const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8()
    163  const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8()
    178  const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8() local
    196  const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8()
    197  const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8()
    220  const v128_t valphav = wasm_v32x4_load_splat(w + 1);  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8() local
    234  const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));  in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8()

D | neonfma-p8.c |
    147  const float32x4_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8() local
    165  const float32x4_t vl = vfmaq_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8()
    166  const float32x4_t vr = vfmaq_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8()
    181  const float32x2_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8() local
    205  const float32x2_t vl = vfma_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8()
    206  const float32x2_t vr = vfma_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8()
    229  const float32x2_t valphav = vld1_dup_f32(w + 1);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8() local
    243  const float32x2_t vlr = vfma_f32(vtltr, vldrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neonfma_p8()

D | neon-p8.c |
    147  const float32x4_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neon_p8() local
    165  const float32x4_t vl = vmlaq_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p8()
    166  const float32x4_t vr = vmlaq_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p8()
    181  const float32x2_t valphav = vw.val[1];  in xnn_f32_ibilinear_chw_ukernel__neon_p8() local
    205  const float32x2_t vl = vmla_f32(vtl, vld, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p8()
    206  const float32x2_t vr = vmla_f32(vtr, vrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p8()
    229  const float32x2_t valphav = vld1_dup_f32(w + 1);  in xnn_f32_ibilinear_chw_ukernel__neon_p8() local
    243  const float32x2_t vlr = vmla_f32(vtltr, vldrd, valphav);  in xnn_f32_ibilinear_chw_ukernel__neon_p8()

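Note: the neonfma-* and neon-* kernels above come from the same template (neon.c.in, listed further down, where ${VMULADDQ_F32} selects the intrinsic) and differ only in the multiply-add used: vfmaq_f32 is a fused multiply-add with a single rounding step, while vmlaq_f32 may lower to a separate multiply and add. A minimal standalone sketch of the two forms (not an XNNPACK kernel; the values are made up):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
      const float32x4_t vt = vdupq_n_f32(1.0f);   /* top interpolant       */
      const float32x4_t vd = vdupq_n_f32(0.5f);   /* bottom minus top      */
      const float32x4_t va = vdupq_n_f32(0.25f);  /* vertical alpha weight */

    #if defined(__ARM_FEATURE_FMA)
      /* neonfma-* path: fused multiply-add, one rounding step. */
      const float32x4_t vo = vfmaq_f32(vt, vd, va);
    #else
      /* neon-* path: multiply-accumulate, may round the product first. */
      const float32x4_t vo = vmlaq_f32(vt, vd, va);
    #endif

      float out[4];
      vst1q_f32(out, vo);
      printf("%f\n", out[0]);  /* 1.0 + 0.5 * 0.25 = 1.125 */
      return 0;
    }
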
D | scalar-p1.c |
     40  const float valphav = w[1];  in xnn_f32_ibilinear_chw_ukernel__scalar_p1() local
     56  const float vo = vt + vd * valphav;  in xnn_f32_ibilinear_chw_ukernel__scalar_p1()

D | scalar-p2.c |
     83  const float valphav = w[1];  in xnn_f32_ibilinear_chw_ukernel__scalar_p2() local
     99  const float vo = vt + vd * valphav;  in xnn_f32_ibilinear_chw_ukernel__scalar_p2()

D | scalar-p4.c |
    113  const float valphav = w[1];  in xnn_f32_ibilinear_chw_ukernel__scalar_p4() local
    129  const float vo = vt + vd * valphav;  in xnn_f32_ibilinear_chw_ukernel__scalar_p4()

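Across all of these kernels, valphav is the vertical interpolation weight, packed immediately after the horizontal one (w[0] is alphah, w[1] is alphav). A minimal scalar sketch of the interpolation the kernels vectorize, with names mirroring the listings (corner loads and the indirection buffer are omitted):

    /* One output pixel from its four corner samples: two horizontal
     * lerps with valphah, then one vertical lerp with valphav. */
    static float ibilinear(float vtl, float vtr, float vbl, float vbr,
                           float valphah, float valphav) {
      const float vt = vtl + (vtr - vtl) * valphah;  /* top edge          */
      const float vb = vbl + (vbr - vbl) * valphah;  /* bottom edge       */
      const float vd = vb - vt;
      return vt + vd * valphav;                      /* the vo hits above */
    }
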
/external/XNNPACK/src/f32-ibilinear/gen/

D | scalar-c4.c |
     36  const float valphav = weights[1];  in xnn_f32_ibilinear_ukernel__scalar_c4() local
     85  const float vo0 = vt0 + vd0 * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c4()
     86  const float vo1 = vt1 + vd1 * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c4()
     87  const float vo2 = vt2 + vd2 * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c4()
     88  const float vo3 = vt3 + vd3 * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c4()
    110  const float vo = vt + vd * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c4()

D | scalar-c2.c |
     36  const float valphav = weights[1];  in xnn_f32_ibilinear_ukernel__scalar_c2() local
     67  const float vo0 = vt0 + vd0 * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c2()
     68  const float vo1 = vt1 + vd1 * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c2()
     88  const float vo = vt + vd * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c2()

D | wasmsimd-c8.c |
     38  const v128_t valphav = wasm_v32x4_load_splat(weights + 1);  in xnn_f32_ibilinear_ukernel__wasmsimd_c8() local
     69  const v128_t vo0123 = wasm_f32x4_add(vt0123, wasm_f32x4_mul(vd0123, valphav));  in xnn_f32_ibilinear_ukernel__wasmsimd_c8()
     70  const v128_t vo4567 = wasm_f32x4_add(vt4567, wasm_f32x4_mul(vd4567, valphav));  in xnn_f32_ibilinear_ukernel__wasmsimd_c8()
     91  const v128_t vo = wasm_f32x4_add(vt, wasm_f32x4_mul(vd, valphav));  in xnn_f32_ibilinear_ukernel__wasmsimd_c8()
    107  v128_t vo = wasm_f32x4_add(vt, wasm_f32x4_mul(vd, valphav));  in xnn_f32_ibilinear_ukernel__wasmsimd_c8()

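Here wasm_v32x4_load_splat(weights + 1) loads one 32-bit value and broadcasts it to all four lanes, so the whole vector holds the vertical weight. A minimal sketch, assuming an Emscripten toolchain with -msimd128 (newer wasm_simd128.h headers spell the same operation wasm_v128_load32_splat):

    #include <wasm_simd128.h>

    /* Broadcast weights[1] (the vertical alpha) across a v128 vector,
     * as the wasmsimd kernels above do before the multiply-add. */
    v128_t splat_alphav(const float* weights) {
      return wasm_v32x4_load_splat(weights + 1);
    }
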
D | sse-c8.c |
     40  const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);  in xnn_f32_ibilinear_ukernel__sse_c8() local
     71  const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));  in xnn_f32_ibilinear_ukernel__sse_c8()
     72  const __m128 vo4567 = _mm_add_ps(vt4567, _mm_mul_ps(vd4567, valphav));  in xnn_f32_ibilinear_ukernel__sse_c8()
     96  const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));  in xnn_f32_ibilinear_ukernel__sse_c8()
    115  __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));  in xnn_f32_ibilinear_ukernel__sse_c8()

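In the SSE kernels the broadcast is a shuffle rather than a splat load: judging by the _mm_movehl_ps(valphahv, valphahv) hit, valphahv holds the horizontal weight in its low pair and the vertical weight in its high pair, and copying the high pair over the low pair yields an all-alphav register. A standalone sketch of that shuffle (the {ah, ah, av, av} layout is inferred from the listing, not copied from the source):

    #include <xmmintrin.h>
    #include <stdio.h>

    int main(void) {
      /* _mm_set_ps takes lanes high-to-low: valphahv = {ah, ah, av, av}. */
      const __m128 valphahv = _mm_set_ps(0.25f, 0.25f, 0.75f, 0.75f);
      /* _mm_movehl_ps(a, b) = {b[2], b[3], a[2], a[3]}; with a == b this
       * broadcasts the high pair, yielding {av, av, av, av}. */
      const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);

      float out[4];
      _mm_storeu_ps(out, valphav);
      printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  /* all 0.25 */
      return 0;
    }
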
D | neonfma-c8.c |
     41  const float32x4_t valphav = vdupq_lane_f32(valphahv, 1);  in xnn_f32_ibilinear_ukernel__neonfma_c8() local
     76  const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav);  in xnn_f32_ibilinear_ukernel__neonfma_c8()
     77  const float32x4_t vo4567 = vfmaq_f32(vt4567, vd4567, valphav);  in xnn_f32_ibilinear_ukernel__neonfma_c8()
    106  const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav);  in xnn_f32_ibilinear_ukernel__neonfma_c8()
    134  float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav);  in xnn_f32_ibilinear_ukernel__neonfma_c8()

D | wasmsimd-c4.c |
     38  const v128_t valphav = wasm_v32x4_load_splat(weights + 1);  in xnn_f32_ibilinear_ukernel__wasmsimd_c4() local
     57  const v128_t vo = wasm_f32x4_add(vt, wasm_f32x4_mul(vd, valphav));  in xnn_f32_ibilinear_ukernel__wasmsimd_c4()
     73  v128_t vo = wasm_f32x4_add(vt, wasm_f32x4_mul(vd, valphav));  in xnn_f32_ibilinear_ukernel__wasmsimd_c4()

D | sse-c4.c |
     40  const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);  in xnn_f32_ibilinear_ukernel__sse_c4() local
     62  const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));  in xnn_f32_ibilinear_ukernel__sse_c4()
     81  __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));  in xnn_f32_ibilinear_ukernel__sse_c4()

D | neonfma-c4.c |
     41  const float32x4_t valphav = vdupq_lane_f32(valphahv, 1);  in xnn_f32_ibilinear_ukernel__neonfma_c4() local
     65  const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav);  in xnn_f32_ibilinear_ukernel__neonfma_c4()
     92  float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav);  in xnn_f32_ibilinear_ukernel__neonfma_c4()

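The NEON counterpart of the same broadcast is vdupq_lane_f32(valphahv, 1): valphahv is a two-lane vector, presumably {alphah, alphav}, and lane 1 is replicated across a full 128-bit register. A minimal sketch under that assumption:

    #include <arm_neon.h>

    /* Load {alphah, alphav} as a pair and broadcast the vertical weight
     * (lane 1) to all four lanes, mirroring the neonfma kernels above. */
    float32x4_t splat_alphav(const float* weights) {
      const float32x2_t valphahv = vld1_f32(weights);  /* {w[0], w[1]}             */
      return vdupq_lane_f32(valphahv, 1);              /* {w[1], w[1], w[1], w[1]} */
    }
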
D | scalar-c1.c |
     36  const float valphav = weights[1];  in xnn_f32_ibilinear_ukernel__scalar_c1() local
     54  const float vo = vt + vd * valphav;  in xnn_f32_ibilinear_ukernel__scalar_c1()

/external/XNNPACK/src/f32-ibilinear-chw/

D | wasmsimd.c.in |
     53  …const v128_t valphav${ABC[P:P+4]} = wasm_v32x4_shuffle(vw${ABC[P:P+4]}p0, vw${ABC[P:P+4]}p1, 1, 3,…
     71  …P:P+4]} = wasm_f32x4_add(vtl${ABC[P:P+4]}, wasm_f32x4_mul(vld${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
     72  …P:P+4]} = wasm_f32x4_add(vtr${ABC[P:P+4]}, wasm_f32x4_mul(vrd${ABC[P:P+4]}, valphav${ABC[P:P+4]}));
    102  const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7);  variable
    117  const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
    118  const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
    133  const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3);  variable
    150  const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav));
    151  const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav));
    174  const v128_t valphav = wasm_v32x4_load_splat(w + 1);  variable
    [all …]

D | neon.c.in |
     52  const float32x4_t valphav${ABC[P:P+4]} = vw${ABC[P:P+4]}.val[1];
     72  …32x4_t vl${ABC[P:P+4]} = ${VMULADDQ_F32}(vtl${ABC[P:P+4]}, vld${ABC[P:P+4]}, valphav${ABC[P:P+4]});
     73  …32x4_t vr${ABC[P:P+4]} = ${VMULADDQ_F32}(vtr${ABC[P:P+4]}, vrd${ABC[P:P+4]}, valphav${ABC[P:P+4]});
    100  const float32x4_t valphav = vw.val[1];
    117  const float32x4_t vl = ${VMULADDQ_F32}(vtl, vld, valphav);
    118  const float32x4_t vr = ${VMULADDQ_F32}(vtr, vrd, valphav);
    133  const float32x2_t valphav = vw.val[1];
    155  const float32x2_t vl = ${VMULADD_F32}(vtl, vld, valphav);
    156  const float32x2_t vr = ${VMULADD_F32}(vtr, vrd, valphav);
    179  const float32x2_t valphav = vld1_dup_f32(w + 1);
    [all …]

D | scalar.c.in |
     41  const float valphav${ABC[P]} = w[${P * 2 + 1}];
     62  const float vo${ABC[P]} = vt${ABC[P]} + vd${ABC[P]} * valphav${ABC[P]};
     75  const float valphav = w[1];  variable
     91  const float vo = vt + vd * valphav;
    102  const float valphav = w[1];  variable
    118  const float vo = vt + vd * valphav;

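The .c.in files are templates: ${ABC[P]} expands to a per-pixel suffix and ${P * 2 + 1} to the index of that pixel's vertical weight in the interleaved weight pairs, and XNNPACK's generator unrolls them into the gen/ sources above. For example, template line 41 unrolled for a four-pixel (p4) kernel would produce roughly the following (reconstructed from the pattern, not copied from gen/):

    /* ${ABC[P]} -> pixel suffix; ${P * 2 + 1} -> vertical-weight index
     * in the interleaved {alphah, alphav} pairs stored in w. */
    const float valphav0 = w[1];
    const float valphav1 = w[3];
    const float valphav2 = w[5];
    const float valphav3 = w[7];
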
/external/XNNPACK/src/f32-ibilinear/

D | scalar.c.in |
     35  const float valphav = weights[1];
     63  const float vo${ABC[C]} = vt${ABC[C]} + vd${ABC[C]} * valphav;
     83  const float vo = vt + vd * valphav;
    102  const float vo = vt + vd * valphav;

D | sse.c.in |
     40  const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);
     71  … const __m128 vo${ABC[C:C+4]} = _mm_add_ps(vt${ABC[C:C+4]}, _mm_mul_ps(vd${ABC[C:C+4]}, valphav));
     97  const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
    116  __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));

D | wasmsimd.c.in |
     38  const v128_t valphav = wasm_v32x4_load_splat(weights + 1);
     70  …v128_t vo${ABC[C:C+4]} = wasm_f32x4_add(vt${ABC[C:C+4]}, wasm_f32x4_mul(vd${ABC[C:C+4]}, valphav));
     92  const v128_t vo = wasm_f32x4_add(vt, wasm_f32x4_mul(vd, valphav));
    108  v128_t vo = wasm_f32x4_add(vt, wasm_f32x4_mul(vd, valphav));

D | neon.c.in |
     43  const float32x4_t valphav = vdupq_lane_f32(valphahv, 1);
     79  const float32x4_t vo${ABC[C:C+4]} = vfmaq_f32(vt${ABC[C:C+4]}, vd${ABC[C:C+4]}, valphav);
    117  const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav);
    152  float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav);