/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-scalar-3x1.c | 134 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 138 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 140 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 170 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 174 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 176 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-3x4.c | 166 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() local 169 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 171 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 234 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() local 237 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 240 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 248 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 257 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 260 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-3x4.c | 166 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local 169 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 171 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 234 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local 237 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 240 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 248 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 257 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 260 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
|
D | 3x3p1-minmax-ssse3-3x4.c | 161 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() local 165 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 167 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 230 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() local 234 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 237 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 245 _mm_storel_pi((__m64*) o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 254 vo2 = _mm_movehl_ps(vo2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 257 _mm_store_ss(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4()
|
D | 3x3s2p1-minmax-scalar-3x1.c | 158 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() local 162 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 164 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 202 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() local 206 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 208 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-3x4.c | 155 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() local 158 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 160 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 230 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() local 233 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 236 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 241 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 247 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 250 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-3x4.c | 155 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() local 158 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 160 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 230 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() local 233 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 236 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 241 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 247 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 250 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
|
D | 3x3p1-minmax-sse-3x4.c | 203 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() local 207 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 209 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 304 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() local 308 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 311 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 319 _mm_storel_pi((__m64*) o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 328 vo2 = _mm_movehl_ps(vo2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 331 _mm_store_ss(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4()
|
D | 3x3p1-minmax-scalar-4x1.c | 154 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 159 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 163 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 199 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 204 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 208 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 5x5p2-minmax-scalar-3x1.c | 248 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 252 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 254 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 351 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 355 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 357 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 416 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 420 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 422 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 3x3s2p1-minmax-scalar-4x1.c | 187 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 192 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 196 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 242 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 247 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 251 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1()
|
D | 3x3p1-minmax-scalar-5x1.c | 174 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 180 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 186 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 228 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 234 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 240 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
|
D | 3x3s2p1-minmax-wasmsimd-x86-loadsplat-3x4.c | 198 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() local 201 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 203 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 291 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() local 294 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 298 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 303 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 309 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 312 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 190 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 194 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 199 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 274 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 278 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 284 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 294 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 303 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 309 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c | 190 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 194 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 199 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 274 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 278 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 284 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 294 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 303 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 309 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-3x4.c | 188 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() local 191 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 193 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 281 v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() local 284 vo2 = wasm_f32x4_pmin(vmax, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 288 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 293 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 299 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 302 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
|
D | 3x3p1-minmax-ssse3-4x4.c | 185 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 190 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 195 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 270 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 275 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 281 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 291 _mm_storel_pi((__m64*) o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 300 vo2 = _mm_movehl_ps(vo2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 306 _mm_store_ss(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c | 188 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() local 191 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 193 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 281 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() local 284 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 288 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 293 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 299 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 302 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
|
D | 3x3s2p1-minmax-wasmsimd-arm-loadsplat-3x4.c | 198 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() local 201 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 203 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 291 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() local 294 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 298 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 303 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 309 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 312 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 281 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 285 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 287 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 369 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 373 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 375 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 430 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 434 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 436 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 284 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 288 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 290 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 375 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 379 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 381 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 439 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 443 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 445 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 251 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 255 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 257 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 357 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 361 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 363 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 425 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 429 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 431 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 3x3p1-minmax-neon-3x4.c | 154 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() local 158 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 160 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 230 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() local 234 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 237 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 243 float32x2_t vo2_lo = vget_low_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 251 vo2_lo = vget_high_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4()
|
D | 3x3p1-minmax-neonfma-3x4.c | 154 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() local 158 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 160 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 230 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() local 234 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 237 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 243 float32x2_t vo2_lo = vget_low_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 251 vo2_lo = vget_high_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4()
|
/external/XNNPACK/src/f16-dwconv2d-chw/gen/ |
D | 3x3s2p1-minmax-neonfp16arith-3x4.c | 158 float16x4_t vo2 = vmax_f16(vo2p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() local 162 vo2 = vmin_f16(vo2, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() 164 vst1_f16(o2, vo2); o2 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() 245 float16x4_t vo2 = vmax_f16(vo2p0, vmin); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() local 249 vo2 = vmin_f16(vo2, vmax); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() 254 vst1_f16(o2, vo2); o2 += 4; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() 259 vst1_lane_u32((void*) o2, vreinterpret_u32_f16(vo2), 0); o2 += 2; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() 265 vo2 = vext_f16(vo2, vo2, 2); in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4() 268 vst1_lane_f16(o2, vo2, 0); o2 += 1; in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x4()
|