/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x8s4-wasmrelaxedsimd-fma.c | 113 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() local 124 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 129 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 135 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 144 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 149 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 155 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 164 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 169 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 175 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4-minmax-wasmrelaxedsimd-fma.c | 115 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() local 126 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 131 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 137 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 146 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 151 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 157 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 166 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 171 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 177 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4-relu-wasmsimd.c | 113 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() local 124 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 129 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 135 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 144 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 149 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 155 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 164 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 169 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() 175 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd() [all …]
|
D | 5x8s4-wasmsimd.c | 113 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() local 124 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 129 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 135 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 144 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 149 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 155 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 164 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 169 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() 175 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_5x8s4__wasmsimd() [all …]
|
D | 5x8s4-relu-wasmrelaxedsimd-fma.c | 113 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() local 124 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 129 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 135 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 144 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 149 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 155 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 164 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 169 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 175 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4-minmax-wasmsimd-arm.c | 115 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() local 126 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 131 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 137 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 146 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 151 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 157 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 166 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 171 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() 177 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm() [all …]
|
D | 6x8s4-wasmrelaxedsimd-fma.c | 124 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() local 137 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 143 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 150 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 160 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 166 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 173 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 183 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 189 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 196 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4-minmax-wasmrelaxedsimd.c | 115 v128_t va4 = wasm_v128_load(a4); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() local 126 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 131 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 137 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 146 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 151 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 157 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 166 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 171 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 177 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() [all …]
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 5x8s4-wasmrelaxedsimd-fma.c | 88 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() local 99 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 104 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 110 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 119 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 124 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 130 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 139 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 144 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() 150 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4-relu-wasmrelaxedsimd-fma.c | 88 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() local 99 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 104 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 110 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 119 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 124 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 130 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 139 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 144 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() 150 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4-minmax-wasmrelaxedsimd.c | 90 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() local 101 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 106 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 112 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 121 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 126 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 132 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 141 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 146 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() 152 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd() [all …]
|
D | 5x8s4-minmax-wasmsimd-arm.c | 90 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() local 101 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 106 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 112 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 121 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 126 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 132 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 141 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 146 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() 152 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm() [all …]
|
D | 5x8s4-relu-wasmsimd.c | 88 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() local 99 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 104 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 110 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 119 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 124 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 130 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 139 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 144 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() 150 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd() [all …]
|
D | 5x8s4-minmax-wasmsimd-x86.c | 90 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() local 101 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 106 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 112 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 121 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 126 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 132 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 141 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 146 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() 152 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86() [all …]
|
D | 5x8s4-minmax-wasmrelaxedsimd-fma.c | 90 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() local 101 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 106 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 112 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 121 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 126 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 132 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 141 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 146 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 152 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4-wasmsimd.c | 88 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() local 99 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 104 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 110 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 119 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 124 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 130 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 139 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 144 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() 150 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_5x8s4__wasmsimd() [all …]
|
D | 5x8s4-minmax-sse.c | 88 __m128 va4 = _mm_loadu_ps(a4); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() local 99 vacc4x0123 = _mm_add_ps(vacc4x0123, _mm_mul_ps(va4, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 104 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 110 va4 = _mm_shuffle_ps(va4, va4, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 119 vacc4x0123 = _mm_add_ps(vacc4x0123, _mm_mul_ps(va4, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 124 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 130 va4 = _mm_shuffle_ps(va4, va4, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 139 vacc4x0123 = _mm_add_ps(vacc4x0123, _mm_mul_ps(va4, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 144 vacc4x4567 = _mm_add_ps(vacc4x4567, _mm_mul_ps(va4, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() 150 va4 = _mm_shuffle_ps(va4, va4, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_5x8s4__sse() [all …]
|
D | 6x8s4-relu-wasmrelaxedsimd-fma.c | 96 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() local 109 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 115 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 122 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 132 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 138 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 145 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 155 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 161 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() 168 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 6x8s4-wasmsimd.c | 96 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() local 109 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 115 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 122 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 132 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 138 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 145 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 155 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 161 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() 168 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__wasmsimd() [all …]
|
D | 6x8s4-wasmrelaxedsimd-fma.c | 96 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() local 109 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 115 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 122 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 132 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 138 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 145 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 155 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 161 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() 168 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma() [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 5x8s4inc-minmax-wasmsimd-x86.c | 92 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() local 103 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 108 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 114 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 123 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 128 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 134 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 143 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 148 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 154 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() [all …]
|
D | 5x8s4inc-minmax-wasmrelaxedsimd-fma.c | 92 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() local 103 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 108 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 114 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 123 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 128 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 134 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 143 vacc4x0123 = __builtin_wasm_fma_f32x4(vacc4x0123, va4, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 148 vacc4x4567 = __builtin_wasm_fma_f32x4(vacc4x4567, va4, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() 154 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma() [all …]
|
D | 5x8s4inc-minmax-wasmsimd-arm.c | 92 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() local 103 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 108 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 114 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 123 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 128 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 134 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 143 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 148 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() 154 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_arm() [all …]
|
D | 5x8s4inc-minmax-wasmrelaxedsimd.c | 92 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() local 103 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 108 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 114 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 123 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 128 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 134 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 143 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 148 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() 154 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmsimd_x86() [all …]
|
D | 5x8s4inc-minmax-wasmrelaxedsimd.c | 92 v128_t va4 = wasm_v128_load(a4); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() local 103 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 108 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 114 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 123 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 128 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 134 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 143 vacc4x0123 = wasm_f32x4_add(vacc4x0123, wasm_f32x4_mul(va4, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 148 vacc4x4567 = wasm_f32x4_add(vacc4x4567, wasm_f32x4_mul(va4, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() 154 va4 = wasm_v32x4_shuffle(va4, va4, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_5x8s4__wasmrelaxedsimd() [all …]
|