/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 6x8-neon-dup-ld128.c | 101 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 102 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 103 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 104 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 105 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 106 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 123 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 124 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 125 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 126 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() [all …]
|
D | 6x8-neonfma-dup-ld128.c | 101 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 102 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 103 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 104 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 105 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 106 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 123 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 124 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 125 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 126 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() [all …]
|
D | 4x8-neonfma-dup-ld128.c | 83 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 84 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 85 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 86 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 99 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 100 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 101 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 102 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 115 const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() 116 const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128() [all …]
|
D | 4x8-neon-dup-ld128.c | 83 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 84 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 85 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 86 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 99 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 100 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 101 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 102 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 115 const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() 116 const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128() [all …]
|
D | 6x8-neon-dup-ld64.c | 100 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 101 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 102 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 103 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 104 const float32x4_t va4c0 = vdupq_lane_f32(va4, 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 105 const float32x4_t va5c0 = vdupq_lane_f32(va5, 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 121 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 122 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 123 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() 124 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64() [all …]
|
D | 6x8-neonfma-dup-ld64.c | 100 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 101 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 102 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 103 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 104 const float32x4_t va4c0 = vdupq_lane_f32(va4, 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 105 const float32x4_t va5c0 = vdupq_lane_f32(va5, 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 121 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 122 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 123 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() 124 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64() [all …]
|
D | 4x8-neon-dup-ld64.c | 82 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64() 83 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64() 84 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64() 85 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64() 97 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64() 98 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64() 99 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64() 100 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64()
|
D | 4x8-neonfma-dup-ld64.c | 82 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64() 83 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64() 84 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64() 85 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64() 97 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64() 98 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64() 99 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64() 100 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 6x8-neonfma-dup-ld128.c | 127 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 128 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 129 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 130 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 131 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 132 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 149 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 150 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 151 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 152 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() [all …]
|
D | 6x8-neon-dup-ld128.c | 127 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 128 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 129 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 130 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 131 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 132 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 149 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 150 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 151 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 152 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() [all …]
|
D | 4x8-neonfma-dup-ld128.c | 103 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 104 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 105 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 106 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 119 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 120 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 121 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 122 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 135 const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() 136 const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128() [all …]
|
D | 4x8-neon-dup-ld128.c | 103 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 104 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 105 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 106 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 119 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 120 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 121 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 122 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 135 const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() 136 const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld128() [all …]
|
D | 6x8-neonfma-dup-ld64.c | 126 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 127 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 128 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 129 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 130 const float32x4_t va4c0 = vdupq_lane_f32(va4, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 131 const float32x4_t va5c0 = vdupq_lane_f32(va5, 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 147 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 148 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 149 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() 150 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64() [all …]
|
D | 6x8-neon-dup-ld64.c | 126 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 127 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 128 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 129 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 130 const float32x4_t va4c0 = vdupq_lane_f32(va4, 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 131 const float32x4_t va5c0 = vdupq_lane_f32(va5, 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 147 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 148 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 149 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() 150 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld64() [all …]
|
D | 4x8-neon-dup-ld64.c | 102 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64() 103 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64() 104 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64() 105 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64() 117 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64() 118 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64() 119 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64() 120 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_igemm_ukernel_4x8__neon_dup_ld64()
|
D | 4x8-neonfma-dup-ld64.c | 102 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64() 103 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64() 104 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64() 105 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64() 117 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64() 118 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64() 119 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64() 120 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 6x8-neonfma-dup-ld128.c | 99 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 100 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 101 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 102 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 103 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 104 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 121 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 122 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 123 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 124 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() [all …]
|
D | 6x8-neon-dup-ld128.c | 99 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 100 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 101 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 102 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 103 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 104 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 121 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 122 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 123 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 124 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() [all …]
|
D | 4x8-neonfma-dup-ld128.c | 81 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 82 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 83 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 84 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 97 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 98 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 99 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 100 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 113 const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() 114 const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128() [all …]
|
D | 4x8-neon-dup-ld128.c | 81 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 82 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 83 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 84 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 97 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 98 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 99 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 100 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 113 const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() 114 const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld128() [all …]
|
D | 6x8-neonfma-dup-ld64.c | 98 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 99 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 100 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 101 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 102 const float32x4_t va4c0 = vdupq_lane_f32(va4, 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 103 const float32x4_t va5c0 = vdupq_lane_f32(va5, 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 119 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 120 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 121 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() 122 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64() [all …]
|
D | 6x8-neon-dup-ld64.c | 98 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 99 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 100 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 101 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 102 const float32x4_t va4c0 = vdupq_lane_f32(va4, 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 103 const float32x4_t va5c0 = vdupq_lane_f32(va5, 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 119 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 120 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 121 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() 122 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld64() [all …]
|
D | 4x8-neon-dup-ld64.c | 80 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64() 81 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64() 82 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64() 83 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64() 95 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64() 96 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64() 97 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64() 98 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemm_ukernel_4x8__neon_dup_ld64()
|
D | 4x8-neonfma-dup-ld64.c | 80 const float32x4_t va0c0 = vdupq_lane_f32(va0, 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64() 81 const float32x4_t va1c0 = vdupq_lane_f32(va1, 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64() 82 const float32x4_t va2c0 = vdupq_lane_f32(va2, 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64() 83 const float32x4_t va3c0 = vdupq_lane_f32(va3, 0); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64() 95 const float32x4_t va0c1 = vdupq_lane_f32(va0, 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64() 96 const float32x4_t va1c1 = vdupq_lane_f32(va1, 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64() 97 const float32x4_t va2c1 = vdupq_lane_f32(va2, 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64() 98 const float32x4_t va3c1 = vdupq_lane_f32(va3, 1); in xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64()
|
/external/XNNPACK/src/f32-ppmm/gen/ |
D | 8x8-neonfma.c | 109 const float32x4_t va0000 = vdupq_lane_f32(vget_low_f32(va0123), 0); in xnn_f32_ppmm_ukernel_8x8__neonfma() 110 const float32x4_t va1111 = vdupq_lane_f32(vget_low_f32(va0123), 1); in xnn_f32_ppmm_ukernel_8x8__neonfma() 111 const float32x4_t va2222 = vdupq_lane_f32(vget_high_f32(va0123), 0); in xnn_f32_ppmm_ukernel_8x8__neonfma() 112 const float32x4_t va3333 = vdupq_lane_f32(vget_high_f32(va0123), 1); in xnn_f32_ppmm_ukernel_8x8__neonfma() 113 const float32x4_t va4444 = vdupq_lane_f32(vget_low_f32(va4567), 0); in xnn_f32_ppmm_ukernel_8x8__neonfma() 114 const float32x4_t va5555 = vdupq_lane_f32(vget_low_f32(va4567), 1); in xnn_f32_ppmm_ukernel_8x8__neonfma() 115 const float32x4_t va6666 = vdupq_lane_f32(vget_high_f32(va4567), 0); in xnn_f32_ppmm_ukernel_8x8__neonfma() 116 const float32x4_t va7777 = vdupq_lane_f32(vget_high_f32(va4567), 1); in xnn_f32_ppmm_ukernel_8x8__neonfma()
|