/external/XNNPACK/src/f16-gemm/gen/ |
D | 8x8-minmax-neonfp16arith-ld64.c |
  107  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64() local
  110  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  111  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  112  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  113  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  114  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  115  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  116  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c0, va6, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  117  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c0, va7, 0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  128  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c |
  91   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64() local
  94   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  95   vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  96   vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  97   vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  98   vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  99   vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  108  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  109  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  110  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  [all …]
|
D | 4x8-minmax-neonfp16arith-ld64.c |
  75   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64() local
  78   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  79   vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  80   vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  81   vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  88   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  89   vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  90   vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  91   vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
|
D | 6x16-minmax-neonfp16arith-ld64.c |
  97   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local
  101  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  102  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  103  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  104  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  105  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  106  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  121  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  122  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  123  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  [all …]
|
D | 8x16-minmax-neonfp16arith-ld64.c |
  115  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() local
  119  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  120  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  121  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  122  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  123  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  124  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  125  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c0, va6, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  126  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c0, va7, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  145  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c |
  79   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() local
  83   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  84   vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  85   vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  86   vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  97   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  98   vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  99   vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  100  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
|
D | 1x8-minmax-neonfp16arith-ld64.c |
  51   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64() local
  54   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64()
  58   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64()
|
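The f16-gemm entries above all share one inner-loop shape: eight packed half-precision weights for the current k-step are loaded into vb01234567c0, then fused-multiply-accumulated into one accumulator row per output row, broadcasting a single lane of the 64-bit ("ld64") A load on each step. The sketch below illustrates that pattern for a 1x8 tile. It is a simplified illustration, not the XNNPACK source: the function name, signature, and the assumption that kc is a multiple of 4 are mine, and the min/max clamping that gives these kernels their "minmax" suffix is omitted.

/* Minimal sketch of the neonfp16arith-ld64 GEMM inner loop (1x8 tile).
   Requires a target with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC (e.g. armv8.2-a+fp16). */
#include <arm_neon.h>
#include <stddef.h>

static void f16_gemm_1x8_sketch(size_t kc, const float16_t* a, const float16_t* w, float16_t* c) {
  float16x8_t vacc0x01234567 = vld1q_f16(c);                 /* start from existing C / bias */
  for (size_t k = 0; k < kc; k += 4) {
    const float16x4_t va0 = vld1_f16(a); a += 4;             /* "ld64": four halfs of A */
    const float16x8_t vb01234567c0 = vld1q_f16(w); w += 8;   /* weights for k + 0 */
    const float16x8_t vb01234567c1 = vld1q_f16(w); w += 8;   /* weights for k + 1 */
    const float16x8_t vb01234567c2 = vld1q_f16(w); w += 8;   /* weights for k + 2 */
    const float16x8_t vb01234567c3 = vld1q_f16(w); w += 8;   /* weights for k + 3 */
    /* One FMA per k-step, broadcasting lane 0..3 of va0, as in the lines listed above. */
    vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0);
    vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c1, va0, 1);
    vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
  }
  vst1q_f16(c, vacc0x01234567);
}

The wider tiles listed above (4x8 up to 8x16) simply replicate the vfmaq_lane_f16 line once per output row (va0..va7) and, for the x16 variants, once more per extra column block.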
/external/XNNPACK/src/f16-igemm/gen/ |
D | 8x8-minmax-neonfp16arith-ld64.c |
  139  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x… in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64() local
  142  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  143  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  144  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  145  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  146  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  147  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  148  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c0, va6, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  149  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c0, va7, 0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  160  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
  [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c |
  117  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x… in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64() local
  120  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  121  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  122  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  123  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  124  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  125  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  134  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  135  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  136  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
  [all …]
|
D | 4x8-minmax-neonfp16arith-ld64.c |
  95   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x… in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64() local
  98   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  99   vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  100  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  101  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  108  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  109  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  110  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
  111  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
|
D | 8x16-minmax-neonfp16arith-ld64.c |
  147  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x… in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() local
  151  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  152  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  153  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  154  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  155  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  156  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  157  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c0, va6, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  158  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c0, va7, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  177  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
  [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c |
  123  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x… in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local
  127  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  128  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  129  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  130  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  131  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  132  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  147  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  148  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  149  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
  [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c |
  99   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x… in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() local
  103  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  104  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  105  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  106  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  117  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  118  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  119  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
  120  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
|
D | 1x8-minmax-neonfp16arith-ld64.c |
  62   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x… in xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64() local
  65   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64()
  69   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64()
|
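The f16-igemm references differ from the f16-gemm ones only in how A is addressed: the inner vld1q_f16 / vfmaq_lane_f16 sequence is the same, but the kernel walks an indirection buffer of row pointers so convolution patches never have to be copied into a dense matrix. Below is a hedged sketch of that outer structure for a 1x8 tile; the name, signature, and layout assumptions are illustrative, not XNNPACK's exact API.

/* Sketch of an indirect-GEMM (IGEMM) outer loop; per-k body matches the GEMM sketch above.
   Requires __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. */
#include <arm_neon.h>
#include <stddef.h>

static void f16_igemm_1x8_sketch(
    size_t ks,             /* number of A pointers, e.g. kernel_h * kernel_w */
    size_t kc,             /* channels read through each pointer (multiple of 4 here) */
    const float16_t** a,   /* indirection buffer: ks pointers into the input tensor */
    const float16_t* w,    /* packed weights: 8 halfs per k-step, ks * (kc / 4) * 4 steps */
    float16_t* c) {
  float16x8_t vacc0x01234567 = vld1q_f16(c);
  for (size_t p = 0; p < ks; p++) {
    const float16_t* a0 = a[p];                              /* next input row / patch element */
    for (size_t k = 0; k < kc; k += 4) {
      const float16x4_t va0 = vld1_f16(a0); a0 += 4;
      const float16x8_t vb01234567c0 = vld1q_f16(w); w += 8;
      const float16x8_t vb01234567c1 = vld1q_f16(w); w += 8;
      const float16x8_t vb01234567c2 = vld1q_f16(w); w += 8;
      const float16x8_t vb01234567c3 = vld1q_f16(w); w += 8;
      vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0);
      vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c1, va0, 1);
      vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
      vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    }
  }
  vst1q_f16(c, vacc0x01234567);
}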
/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 8x8inc-minmax-neonfp16arith-ld64.c |
  109  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64() local
  112  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  113  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  114  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  115  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  116  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  117  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  118  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c0, va6, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  119  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c0, va7, 0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  130  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
  [all …]
|
D | 6x8inc-minmax-neonfp16arith-ld64.c |
  93   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64() local
  96   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  97   vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  98   vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  99   vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  100  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  101  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  110  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  111  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  112  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
  [all …]
|
D | 4x8inc-minmax-neonfp16arith-ld64.c |
  77   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64() local
  80   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
  81   vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
  82   vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
  83   vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
  90   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
  91   vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
  92   vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
  93   vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
|
D | 8x16inc-minmax-neonfp16arith-ld64.c |
  117  …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() local
  121  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  122  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  123  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  124  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  125  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  126  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  127  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c0, va6, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  128  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c0, va7, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  147  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
  [all …]
|
D | 6x16inc-minmax-neonfp16arith-ld64.c |
  99   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() local
  103  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  104  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  105  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  106  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  107  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  108  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  123  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  124  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  125  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
  [all …]
|
D | 4x16inc-minmax-neonfp16arith-ld64.c |
  81   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() local
  85   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
  86   vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
  87   vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
  88   vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
  99   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
  100  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
  101  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
  102  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
|
D | 1x8inc-minmax-neonfp16arith-ld64.c |
  53   …const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_1x8__neonfp16arith_ld64() local
  56   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_1x8__neonfp16arith_ld64()
  60   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0); in xnn_f16_gemminc_minmax_ukernel_1x8__neonfp16arith_ld64()
|
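The gen-inc ("gemminc") variants reuse the same vb01234567c0 load and FMA sequence; the difference is how the accumulators are seeded. Instead of starting from the bias row packed at the head of the weight stream, they start from a caller-provided partial-sum buffer, which allows a large K dimension to be split across several kernel invocations. The fragments below contrast the two initializations; names are illustrative, not the exact XNNPACK signatures.

#include <arm_neon.h>

/* GEMM-style init: the bias row is packed at the head of the weight stream w. */
static float16x8_t init_acc_from_bias(const float16_t** w) {
  const float16x8_t vacc = vld1q_f16(*w);
  *w += 8;
  return vacc;
}

/* GEMMINC-style init: resume from a partial-sum buffer written by a previous
   call that covered an earlier slice of the K dimension. */
static float16x8_t init_acc_from_partial_sum(const float16_t* acc) {
  return vld1q_f16(acc);
}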
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 4x16s4inc-minmax-fma3-broadcast.c |
  84   const __m256 vb01234567c0 = _mm256_load_ps(w + 0); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() local
  87   vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
  88   vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c0, vacc1x01234567); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
  89   vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
  90   vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 4x16s4-minmax-fma3-broadcast.c |
  82   const __m256 vb01234567c0 = _mm256_load_ps(w + 0); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local
  85   vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
  86   vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c0, vacc1x01234567); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
  87   vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
  88   vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 5x16s4-minmax-fma3-broadcast.c |
  92   const __m256 vb01234567c0 = _mm256_load_ps(w + 0); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local
  95   vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
  96   vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c0, vacc1x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
  97   vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
  98   vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
  99   vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x16s4-minmax-fma3-broadcast.c |
  117  const __m256 vb01234567c0 = _mm256_load_ps(w + 0); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local
  120  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
  121  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c0, vacc1x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
  122  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c0, vacc2x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
  123  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c0, vacc3x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
  124  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
|
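The f32 FMA3 entries follow the same idea with AVX2/FMA3 intrinsics: vb01234567c0 holds eight packed single-precision weights loaded with _mm256_load_ps, and each _mm256_fmadd_ps folds them into one row's accumulator against a value of A replicated across all lanes. The sketch below shows the plain broadcast form for a single output row; it deliberately leaves out the in-register "s4" rotation of the A vector that the listed 4x16s4/5x16s4 kernels perform, and the function name and signature are made up for the example.

/* Minimal sketch of the FMA3 broadcast GEMM inner loop (1 row x 8 columns).
   Compile with FMA3/AVX2 enabled (e.g. -mfma -mavx2); w is assumed 32-byte aligned. */
#include <immintrin.h>
#include <stddef.h>

static void f32_gemm_1x8_fma3_sketch(size_t kc, const float* a, const float* w, float* c) {
  __m256 vacc0x01234567 = _mm256_loadu_ps(c);              /* start from existing C / bias */
  for (size_t k = 0; k < kc; k++) {
    const __m256 va0 = _mm256_broadcast_ss(a); a += 1;     /* broadcast one A element to all 8 lanes */
    const __m256 vb01234567c0 = _mm256_load_ps(w); w += 8; /* eight packed weights for this k-step */
    vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
  }
  _mm256_storeu_ps(c, vacc0x01234567);                     /* min/max clamping omitted */
}

The 4x16s4 and 5x16s4 kernels listed above amortize the A loads further: they keep four A values per row in one __m256 and rotate it between column blocks instead of issuing a fresh broadcast for every FMA.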