/external/XNNPACK/src/f32-ibilinear/gen/ |
D | scalar-c4.c | 35 const float valphah = weights[0]; in xnn_f32_ibilinear_ukernel__scalar_c4() local 71 const float vt0 = vtl0 + vtd0 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 72 const float vb0 = vbl0 + vbd0 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 73 const float vt1 = vtl1 + vtd1 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 74 const float vb1 = vbl1 + vbd1 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 75 const float vt2 = vtl2 + vtd2 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 76 const float vb2 = vbl2 + vbd2 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 77 const float vt3 = vtl3 + vtd3 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 78 const float vb3 = vbl3 + vbd3 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() 105 const float vt = vtl + vtd * valphah; in xnn_f32_ibilinear_ukernel__scalar_c4() [all …]
|
D | scalar-c2.c | 35 const float valphah = weights[0]; in xnn_f32_ibilinear_ukernel__scalar_c2() local 59 const float vt0 = vtl0 + vtd0 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c2() 60 const float vb0 = vbl0 + vbd0 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c2() 61 const float vt1 = vtl1 + vtd1 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c2() 62 const float vb1 = vbl1 + vbd1 * valphah; in xnn_f32_ibilinear_ukernel__scalar_c2() 83 const float vt = vtl + vtd * valphah; in xnn_f32_ibilinear_ukernel__scalar_c2() 84 const float vb = vbl + vbd * valphah; in xnn_f32_ibilinear_ukernel__scalar_c2()
|
D | wasmsimd-c8.c | 37 const v128_t valphah = wasm_v32x4_load_splat(weights); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() local 61 const v128_t vt0123 = wasm_f32x4_add(vtl0123, wasm_f32x4_mul(vtd0123, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() 62 const v128_t vb0123 = wasm_f32x4_add(vbl0123, wasm_f32x4_mul(vbd0123, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() 63 const v128_t vt4567 = wasm_f32x4_add(vtl4567, wasm_f32x4_mul(vtd4567, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() 64 const v128_t vb4567 = wasm_f32x4_add(vbl4567, wasm_f32x4_mul(vbd4567, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() 88 const v128_t vt = wasm_f32x4_add(vtl, wasm_f32x4_mul(vtd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() 89 const v128_t vb = wasm_f32x4_add(vbl, wasm_f32x4_mul(vbd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() 104 const v128_t vt = wasm_f32x4_add(vtl, wasm_f32x4_mul(vtd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8() 105 const v128_t vb = wasm_f32x4_add(vbl, wasm_f32x4_mul(vbd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c8()
|
D | sse-c8.c | 39 const __m128 valphah = _mm_movelh_ps(valphahv, valphahv); in xnn_f32_ibilinear_ukernel__sse_c8() local 63 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8() 64 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8() 65 const __m128 vt4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vtd4567, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8() 66 const __m128 vb4567 = _mm_add_ps(vbl4567, _mm_mul_ps(vbd4567, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8() 91 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8() 92 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8() 110 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8() 111 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c8()
|
D | neonfma-c8.c | 40 const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); in xnn_f32_ibilinear_ukernel__neonfma_c8() local 61 const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() 62 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() 63 const float32x4_t vt4567 = vfmaq_f32(vtl4567, vtd4567, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() 64 const float32x4_t vb4567 = vfmaq_f32(vbl4567, vbd4567, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() 96 const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() 97 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() 124 const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() 125 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8()
|
D | wasmsimd-c4.c | 37 const v128_t valphah = wasm_v32x4_load_splat(weights); in xnn_f32_ibilinear_ukernel__wasmsimd_c4() local 54 const v128_t vt = wasm_f32x4_add(vtl, wasm_f32x4_mul(vtd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c4() 55 const v128_t vb = wasm_f32x4_add(vbl, wasm_f32x4_mul(vbd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c4() 70 const v128_t vt = wasm_f32x4_add(vtl, wasm_f32x4_mul(vtd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c4() 71 const v128_t vb = wasm_f32x4_add(vbl, wasm_f32x4_mul(vbd, valphah)); in xnn_f32_ibilinear_ukernel__wasmsimd_c4()
|
D | sse-c4.c | 39 const __m128 valphah = _mm_movelh_ps(valphahv, valphahv); in xnn_f32_ibilinear_ukernel__sse_c4() local 57 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c4() 58 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c4() 76 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c4() 77 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah)); in xnn_f32_ibilinear_ukernel__sse_c4()
|
D | neonfma-c4.c | 40 const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); in xnn_f32_ibilinear_ukernel__neonfma_c4() local 55 const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c4() 56 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c4() 82 const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c4() 83 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c4()
|
D | scalar-c1.c | 35 const float valphah = weights[0]; in xnn_f32_ibilinear_ukernel__scalar_c1() local 49 const float vt = vtl + vtd * valphah; in xnn_f32_ibilinear_ukernel__scalar_c1() 50 const float vb = vbl + vbd * valphah; in xnn_f32_ibilinear_ukernel__scalar_c1()
|
/external/XNNPACK/src/f32-ibilinear-chw/ |
D | scalar.c.in | 40 const float valphah${ABC[P]} = w[${P * 2}]; 55 const float vt${ABC[P]} = vtl${ABC[P]} + vtd${ABC[P]} * valphah${ABC[P]}; 56 const float vb${ABC[P]} = vbl${ABC[P]} + vbd${ABC[P]} * valphah${ABC[P]}; 74 const float valphah = w[0]; variable 86 const float vt = vtl + vtd * valphah; 87 const float vb = vbl + vbd * valphah; 101 const float valphah = w[0]; variable 113 const float vt = vtl + vtd * valphah; 114 const float vb = vbl + vbd * valphah;
|
D | wasmsimd.c.in | 52 …const v128_t valphah${ABC[P:P+4]} = wasm_v32x4_shuffle(vw${ABC[P:P+4]}p0, vw${ABC[P:P+4]}p1, 0, 2,… 78 …C[P:P+4]} = wasm_f32x4_add(vl${ABC[P:P+4]}, wasm_f32x4_mul(vd${ABC[P:P+4]}, valphah${ABC[P:P+4]})); 101 const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6); variable 121 const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah)); 132 const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2); variable 154 const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah));
|
D | neon.c.in | 51 const float32x4_t valphah${ABC[P:P+4]} = vw${ABC[P:P+4]}.val[0]; 79 …at32x4_t vo${ABC[P:P+4]} = ${VMULADDQ_F32}(vl${ABC[P:P+4]}, vd${ABC[P:P+4]}, valphah${ABC[P:P+4]}); 99 const float32x4_t valphah = vw.val[0]; 121 const float32x4_t vo = ${VMULADDQ_F32}(vl, vd, valphah); 132 const float32x2_t valphah = vw.val[0]; 159 const float32x2_t vo = ${VMULADD_F32}(vl, vd, valphah);
|
/external/XNNPACK/src/f32-ibilinear/ |
D | scalar.c.in | 34 const float valphah = weights[0]; 56 const float vt${ABC[C]} = vtl${ABC[C]} + vtd${ABC[C]} * valphah; 57 const float vb${ABC[C]} = vbl${ABC[C]} + vbd${ABC[C]} * valphah; 78 const float vt = vtl + vtd * valphah; 79 const float vb = vbl + vbd * valphah; 97 const float vt = vtl + vtd * valphah; 98 const float vb = vbl + vbd * valphah;
|
D | sse.c.in | 39 const __m128 valphah = _mm_movelh_ps(valphahv, valphahv); 64 …const __m128 vt${ABC[C:C+4]} = _mm_add_ps(vtl${ABC[C:C+4]}, _mm_mul_ps(vtd${ABC[C:C+4]}, valphah)); 65 …const __m128 vb${ABC[C:C+4]} = _mm_add_ps(vbl${ABC[C:C+4]}, _mm_mul_ps(vbd${ABC[C:C+4]}, valphah)); 92 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah)); 93 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah)); 111 const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah)); 112 const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
|
D | wasmsimd.c.in | 37 const v128_t valphah = wasm_v32x4_load_splat(weights); 63 …28_t vt${ABC[C:C+4]} = wasm_f32x4_add(vtl${ABC[C:C+4]}, wasm_f32x4_mul(vtd${ABC[C:C+4]}, valphah)); 64 …28_t vb${ABC[C:C+4]} = wasm_f32x4_add(vbl${ABC[C:C+4]}, wasm_f32x4_mul(vbd${ABC[C:C+4]}, valphah)); 89 const v128_t vt = wasm_f32x4_add(vtl, wasm_f32x4_mul(vtd, valphah)); 90 const v128_t vb = wasm_f32x4_add(vbl, wasm_f32x4_mul(vbd, valphah)); 105 const v128_t vt = wasm_f32x4_add(vtl, wasm_f32x4_mul(vtd, valphah)); 106 const v128_t vb = wasm_f32x4_add(vbl, wasm_f32x4_mul(vbd, valphah));
|
D | neon.c.in | 42 const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); 61 … const float32x4_t vt${ABC[C:C+4]} = vfmaq_f32(vtl${ABC[C:C+4]}, vtd${ABC[C:C+4]}, valphah); 62 … const float32x4_t vb${ABC[C:C+4]} = vfmaq_f32(vbl${ABC[C:C+4]}, vbd${ABC[C:C+4]}, valphah); 103 const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); 104 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); 138 const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); 139 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah);
|
/external/XNNPACK/src/f32-ibilinear-chw/gen/ |
D | scalar-p1.c | 39 const float valphah = w[0]; in xnn_f32_ibilinear_chw_ukernel__scalar_p1() local 51 const float vt = vtl + vtd * valphah; in xnn_f32_ibilinear_chw_ukernel__scalar_p1() 52 const float vb = vbl + vbd * valphah; in xnn_f32_ibilinear_chw_ukernel__scalar_p1()
|
D | scalar-p2.c | 82 const float valphah = w[0]; in xnn_f32_ibilinear_chw_ukernel__scalar_p2() local 94 const float vt = vtl + vtd * valphah; in xnn_f32_ibilinear_chw_ukernel__scalar_p2() 95 const float vb = vbl + vbd * valphah; in xnn_f32_ibilinear_chw_ukernel__scalar_p2()
|
D | wasmsimd-p4.c | 59 const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4() local 80 const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah)); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4() 91 const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4() local 114 const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah)); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p4()
|
D | neonfma-p4.c | 58 const float32x4_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neonfma_p4() local 81 const float32x4_t vo = vfmaq_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neonfma_p4() 92 const float32x2_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neonfma_p4() local 121 const float32x2_t vo = vfma_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neonfma_p4()
|
D | neon-p4.c | 58 const float32x4_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neon_p4() local 81 const float32x4_t vo = vmlaq_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neon_p4() 92 const float32x2_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neon_p4() local 121 const float32x2_t vo = vmla_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neon_p4()
|
D | scalar-p4.c | 112 const float valphah = w[0]; in xnn_f32_ibilinear_chw_ukernel__scalar_p4() local 124 const float vt = vtl + vtd * valphah; in xnn_f32_ibilinear_chw_ukernel__scalar_p4() 125 const float vb = vbl + vbd * valphah; in xnn_f32_ibilinear_chw_ukernel__scalar_p4()
|
D | wasmsimd-p8.c | 145 const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8() local 166 const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah)); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8() 177 const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8() local 200 const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah)); in xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8()
|
D | neonfma-p8.c | 146 const float32x4_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neonfma_p8() local 169 const float32x4_t vo = vfmaq_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neonfma_p8() 180 const float32x2_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neonfma_p8() local 209 const float32x2_t vo = vfma_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neonfma_p8()
|
D | neon-p8.c | 146 const float32x4_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neon_p8() local 169 const float32x4_t vo = vmlaq_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neon_p8() 180 const float32x2_t valphah = vw.val[0]; in xnn_f32_ibilinear_chw_ukernel__neon_p8() local 209 const float32x2_t vo = vmla_f32(vl, vd, valphah); in xnn_f32_ibilinear_chw_ukernel__neon_p8()
|