/external/XNNPACK/src/f16-gemm/gen/ |
D | 1x16-minmax-neonfp16arith-ld64.c | 46 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() local 57 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 62 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 69 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 74 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 81 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 86 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 93 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 98 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 111 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | 64 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 66 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 68 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 70 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 87 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 101 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 114 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 128 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 141 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 155 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 76 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 78 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 80 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 82 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 84 float16x8_t vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 86 float16x8_t vacc5x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 107 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 127 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 144 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 164 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 1x16inc-minmax-neonfp16arith-ld64.c | 48 …float16x8_t vacc0x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() local 59 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 64 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 71 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 76 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 83 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 88 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 95 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 100 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 113 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() [all …]
|
D | 4x16inc-minmax-neonfp16arith-ld64.c | 66 …float16x8_t vacc0x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() local 89 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 103 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 116 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 130 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 143 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 157 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 170 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 184 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 206 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-igemm/gen/ |
D | 1x16-minmax-neonfp16arith-ld64.c | 48 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() local 68 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 73 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 80 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 85 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 92 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 97 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 104 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 109 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 120 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | 60 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 62 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 64 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 66 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 107 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 121 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 134 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 148 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 161 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 175 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 68 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 70 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 72 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 74 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 76 float16x8_t vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 78 float16x8_t vacc5x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 133 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 153 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 170 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 190 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 1x16s4-minmax-fma3-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local 56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 64 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 72 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 80 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 96 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 104 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 108 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 112 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 122 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
|
D | 3x16s4-minmax-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 78 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 92 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 120 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 144 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 156 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 164 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() [all …]
|
D | 1x16-minmax-fma3-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() local 56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 63 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 67 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 71 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 81 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast()
|
D | 1x16-minmax-avx-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() local 56 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 63 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 67 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 71 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 81 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast()
|
D | 4x16s4-minmax-fma3-broadcast.c | 61 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local 63 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 89 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 123 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 140 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 168 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 182 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() [all …]
|
D | 5x16s4-minmax-fma3-broadcast.c | 67 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local 69 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 71 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 73 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 75 __m256 vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 100 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 120 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 140 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 160 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 192 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 1x16s4inc-minmax-fma3-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() local 58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 66 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 74 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 82 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 98 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 110 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 114 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 124 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
|
D | 1x16inc-minmax-fma3-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() local 58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 65 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 69 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 73 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 83 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast()
|
D | 1x16inc-minmax-avx-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() local 58 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 65 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 69 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 73 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 83 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast()
|
D | 3x16s4inc-minmax-fma3-broadcast.c | 57 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() local 80 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 94 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 108 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 122 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 146 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 158 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 166 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 178 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 194 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 1x16s4-minmax-fma3-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() local 69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 77 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 85 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 93 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 109 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 119 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 123 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 127 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 136 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
|
D | 3x16s4-minmax-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 97 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 111 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 125 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 139 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 163 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 177 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 185 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() [all …]
|
D | 1x16-minmax-avx-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() local 69 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 77 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 81 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 85 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 94 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast()
|
D | 1x16-minmax-fma3-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() local 69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 77 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 81 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 85 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 94 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast()
|
D | 4x16s4-minmax-fma3-broadcast.c | 59 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local 61 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 63 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 111 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 128 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 145 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 162 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 190 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 206 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() [all …]
|
D | 5x16s4-minmax-fma3-broadcast.c | 63 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local 65 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 67 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 69 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 71 __m256 vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 125 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 145 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 165 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 185 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 217 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() [all …]
|
/external/XNNPACK/src/qs8-gavgpool/gen/ |
D | 7p7x-minmax-sse41-c16-acc2.c | 68 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() local 73 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() 77 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() 81 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() 85 const __m128i vacc89AB = _mm_add_epi32(vbias, _mm_cvtepi16_epi32(vacc0x89ABCDEF)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() 86 … = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si128(), va… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() 131 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() local 136 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() 140 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() 144 vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2() [all …]
|