/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc3.c | 109 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() local 110 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() 112 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() 154 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() local 155 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() 158 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() 161 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() 163 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() 166 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc4.c | 110 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() local 111 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() 113 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() 156 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() local 157 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() 160 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() 163 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() 165 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() 168 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | 118 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() local 119 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() 121 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() 156 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() local 157 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() 160 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() 164 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() 167 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() 170 *o0 = wasm_f32x4_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c | 117 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() local 118 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() 120 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() 154 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() local 155 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() 158 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() 162 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() 165 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() 168 *o0 = wasm_f32x4_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4.c | 116 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() local 117 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() 119 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() 152 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() local 153 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() 156 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() 160 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() 163 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() 166 *o0 = wasm_f32x4_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-1x4-acc4.c | 110 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() local 111 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() 113 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() 156 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() local 157 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() 160 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() 163 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() 165 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() 168 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-1x4.c | 107 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() local 108 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() 110 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() 150 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() local 151 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() 154 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() 157 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() 159 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() 162 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c | 117 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() local 118 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() 120 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() 154 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() local 155 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() 158 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() 162 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() 165 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() 168 *o0 = wasm_f32x4_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-1x4-acc2.c | 108 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() local 109 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 111 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 152 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() local 153 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 156 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 159 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 161 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 164 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc2.c | 108 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() local 109 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() 111 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() 152 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() local 153 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() 156 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() 159 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() 161 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() 164 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-1x4-acc3.c | 109 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() local 110 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() 112 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() 154 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() local 155 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() 158 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() 161 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() 163 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() 166 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
|
D | 3x3p1-minmax-ssse3-1x4.c | 111 __m128 vo0 = _mm_max_ps(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() local 113 vo0 = _mm_min_ps(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() 115 _mm_storeu_ps(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() 148 __m128 vo0 = _mm_max_ps(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() local 150 vo0 = _mm_min_ps(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() 153 _mm_storeu_ps(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() 157 _mm_storel_pi((__m64*) o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() 160 vo0 = _mm_movehl_ps(vo0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() 163 _mm_store_ss(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c | 118 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() local 119 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() 121 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() 156 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() local 157 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() 160 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() 164 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() 167 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() 170 *o0 = wasm_f32x4_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-1x4.c | 116 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() local 117 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() 119 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() 152 v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() local 153 vo0 = wasm_f32x4_min(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() 156 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() 160 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() 163 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() 166 *o0 = wasm_f32x4_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-1x4.c | 107 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() local 108 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() 110 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() 150 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() local 151 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() 154 wasm_v128_store(o0, vo0); o0 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() 157 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() 159 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() 162 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
|
D | 3x3p1-minmax-scalar-1x1-acc3.c | 94 float vo0 = math_max_f32(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() local 96 vo0 = math_min_f32(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 98 *o0++ = vo0; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 114 float vo0 = math_max_f32(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() local 116 vo0 = math_min_f32(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 118 *o0++ = vo0; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3()
|
D | 3x3p1-minmax-scalar-1x1-acc2.c | 93 float vo0 = math_max_f32(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() local 95 vo0 = math_min_f32(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 97 *o0++ = vo0; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 112 float vo0 = math_max_f32(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() local 114 vo0 = math_min_f32(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 116 *o0++ = vo0; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2()
|
D | 3x3p1-minmax-scalar-1x1.c | 92 float vo0 = math_max_f32(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() local 94 vo0 = math_min_f32(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 96 *o0++ = vo0; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 110 float vo0 = math_max_f32(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() local 112 vo0 = math_min_f32(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 114 *o0++ = vo0; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1()
|
D | 3x3p1-minmax-ssse3-1x4-acc3.c | 113 __m128 vo0 = _mm_max_ps(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() local 115 vo0 = _mm_min_ps(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() 117 _mm_storeu_ps(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() 152 __m128 vo0 = _mm_max_ps(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() local 154 vo0 = _mm_min_ps(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() 157 _mm_storeu_ps(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() 161 _mm_storel_pi((__m64*) o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() 164 vo0 = _mm_movehl_ps(vo0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3() 167 _mm_store_ss(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3()
|
D | 3x3p1-minmax-ssse3-1x4-acc2.c | 112 __m128 vo0 = _mm_max_ps(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() local 114 vo0 = _mm_min_ps(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() 116 _mm_storeu_ps(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() 150 __m128 vo0 = _mm_max_ps(vo0p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() local 152 vo0 = _mm_min_ps(vo0, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() 155 _mm_storeu_ps(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() 159 _mm_storel_pi((__m64*) o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() 162 vo0 = _mm_movehl_ps(vo0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2() 165 _mm_store_ss(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c | 119 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() local 120 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() 122 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() 158 v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() local 159 vo0 = wasm_f32x4_pmin(vmax, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() 162 wasm_v128_store(o0, vo0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() 166 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() 169 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4() 172 *o0 = wasm_f32x4_extract_lane(vo0, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4()
|
/external/XNNPACK/src/f16-dwconv2d-chw/gen/ |
D | 3x3s2p1-minmax-neonfp16arith-1x4.c | 99 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() local 101 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() 103 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() 144 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() local 146 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() 151 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() 154 vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() 156 vo0 = vext_f16(vo0, vo0, 2); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4() 159 vst1_lane_f16(o0, vo0, 0); o0 += 1; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4()
|
D | 3x3s2p1-minmax-neonfp16arith-1x4-acc3.c | 101 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() local 103 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() 105 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() 148 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() local 150 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() 155 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() 158 vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() 160 vo0 = vext_f16(vo0, vo0, 2); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3() 163 vst1_lane_f16(o0, vo0, 0); o0 += 1; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3()
|
D | 3x3s2p1-minmax-neonfp16arith-1x4-acc2.c | 100 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() local 102 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() 104 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() 146 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() local 148 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() 153 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() 156 vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() 158 vo0 = vext_f16(vo0, vo0, 2); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2() 161 vst1_lane_f16(o0, vo0, 0); o0 += 1; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc2()
|
D | 3x3s2p1-minmax-neonfp16arith-1x4-acc4.c | 102 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() local 104 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() 106 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() 150 float16x4_t vo0 = vmax_f16(vo0p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() local 152 vo0 = vmin_f16(vo0, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() 157 vst1_f16(o0, vo0); o0 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() 160 vst1_lane_u32((void*) o0, vreinterpret_u32_f16(vo0), 0); o0 += 2; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() 162 vo0 = vext_f16(vo0, vo0, 2); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4() 165 vst1_lane_f16(o0, vo0, 0); o0 += 1; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4()
|