/external/XNNPACK/src/f16-gemm/gen/ |
D | 1x16-minmax-neonfp16arith-ld64.c | 46 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() local 57 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 62 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 69 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 74 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 81 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 86 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 93 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 98 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 111 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | 64 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 66 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 68 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 70 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 87 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 101 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 114 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 128 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 141 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 155 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 1x16-minmax-avx2-broadcast.c | 44 …__m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8))… in xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast() local 57 …vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF),… in xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast() 64 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast() 68 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast() 72 _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC)); in xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast() 83 vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC); in xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast()
|
D | 6x16-minmax-neonfp16arith-ld64.c | 76 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 78 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 80 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 82 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 84 float16x8_t vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 86 float16x8_t vacc5x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 107 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 127 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 144 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 164 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 1x16inc-minmax-neonfp16arith-ld64.c | 48 …float16x8_t vacc0x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() local 59 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 64 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 71 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 76 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 83 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 88 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 95 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 100 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 113 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() [all …]
|
D | 4x16inc-minmax-neonfp16arith-ld64.c | 66 …float16x8_t vacc0x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() local 89 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 103 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 116 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 130 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 143 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 157 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 170 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 184 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 206 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-igemm/gen/ |
D | 1x16-minmax-neonfp16arith-ld64.c | 48 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() local 68 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 73 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 80 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 85 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 92 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 97 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 104 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 109 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 120 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | 60 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 62 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 64 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 66 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 107 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 121 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 134 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 148 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 161 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 175 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 68 …float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t)); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 70 float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 72 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 74 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 76 float16x8_t vacc4x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 78 float16x8_t vacc5x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 133 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 153 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 170 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 190 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
D | 1x16-minmax-avx2-broadcast.c | 48 …__m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8))… in xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast() local 70 …vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF),… in xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast() 79 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast() 83 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast() 87 _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC)); in xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast() 97 vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC); in xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 1x16s4-minmax-fma3-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local 56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 64 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 72 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 80 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 96 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 104 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 112 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 120 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 128 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() [all …]
|
D | 3x16s4-minmax-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 78 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 92 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 120 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 144 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 158 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 172 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() [all …]
|
D | 1x16-minmax-fma3-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() local 56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 63 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 67 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 71 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast() 81 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast()
|
D | 1x16-minmax-avx-broadcast.c | 43 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() local 56 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 63 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 67 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 71 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast() 81 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast()
|
D | 4x16s4-minmax-fma3-broadcast.c | 61 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local 63 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 89 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 106 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 123 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 140 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 168 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 185 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() [all …]
|
D | 3x16-minmax-avx-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 78 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 89 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 97 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 109 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 125 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 1x16s4inc-minmax-fma3-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() local 58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 66 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 74 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 82 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 98 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 106 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 114 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 122 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 130 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() [all …]
|
D | 3x16s4inc-minmax-fma3-broadcast.c | 57 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() local 80 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 94 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 108 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 122 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 146 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 160 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 174 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 188 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ… in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 200 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() [all …]
|
D | 1x16inc-minmax-fma3-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() local 58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 65 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 69 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 73 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast() 83 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_1x16__fma3_broadcast()
|
D | 1x16inc-minmax-avx-broadcast.c | 45 __m256 vacc0x89ABCDEF = _mm256_load_ps(acc + 8); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() local 58 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 65 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 69 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 73 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast() 83 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_1x16__avx_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 1x16s4-minmax-fma3-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() local 69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 77 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 85 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 93 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 109 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 117 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 125 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 133 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc3, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 143 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() [all …]
|
D | 3x16s4-minmax-fma3-broadcast.c | 55 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() local 57 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 97 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 111 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 125 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 139 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 163 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 177 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 191 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc2, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() [all …]
|
D | 1x16-minmax-fma3-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() local 69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 77 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 81 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 85 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast() 94 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast()
|
D | 1x16-minmax-avx-broadcast.c | 47 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() local 69 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF)); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 77 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 81 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 85 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast() 94 vacc0x01234567 = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast()
|
D | 4x16s4-minmax-fma3-broadcast.c | 59 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local 61 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 63 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 111 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 128 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 145 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 162 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 190 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 207 …vacc0x89ABCDEF = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc1, vzero, _CMP_NEQ_OQ… in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() [all …]
|