/external/XNNPACK/src/qs8-igemm/gen/

D | 4x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup():
      104: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      107: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      110: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      113: const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
      118: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      121: const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
      124: const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
      127: const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
      132: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      135: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      [all …]
D | 3x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup():
      91: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      94: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      97: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      102: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      105: const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
      108: const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
      113: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      116: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      119: const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
      124: const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
      [all …]
D | 2x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup():
      78: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      81: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      86: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      89: const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
      94: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      97: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      102: const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
      105: const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
      110: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      113: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      [all …]
D | 4x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup():
      96: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      99: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      102: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      105: const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
      110: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      113: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      116: const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
      119: const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
      124: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      127: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      [all …]
D | 3x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup():
      85: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      88: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      91: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      96: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      99: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      102: const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
      107: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      110: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      113: const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
      118: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      [all …]
D | 2x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup():
      74: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      77: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      82: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      85: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      90: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      93: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      98: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      101: const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
      106: const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
      109: const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
      [all …]
D | 1x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup():
      65: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      70: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      75: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      80: const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
      85: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      90: const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
      95: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      100: const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
      105: const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
      110: const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
      [all …]
D | 1x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup():
      63: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      68: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      73: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      78: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      83: const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
      88: const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
      93: const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
      98: const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
      109: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      116: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      [all …]
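Every matched line above follows the same pattern: one lane of the current A vector is broadcast with vdup_lane_s8, widening-multiplied against an 8-wide slice of the B (weight) vector with vmull_s8, and the resulting int16 products are then widen-added into int32 accumulators (the "addw" half of the kernel name, which falls outside the truncated matches). A minimal sketch of one such step; the accumulator names are chosen for illustration rather than taken from the matches:

    #include <arm_neon.h>

    /* Sketch of one "mull-addw-dup" step. The accumulator names vacc0x0123 and
     * vacc0x4567 are illustrative assumptions; only the vmull_s8/vdup_lane_s8
     * line appears in the matches above. */
    static inline void qs8_mull_addw_dup_step(
        int8x8_t va0,           /* 8 consecutive signed 8-bit values of row 0 of A */
        int8x8_t vb01234567c0,  /* 8 signed 8-bit weights for columns 0..7 at depth 0 */
        int32x4_t *vacc0x0123,  /* running sums for columns 0..3 */
        int32x4_t *vacc0x4567)  /* running sums for columns 4..7 */
    {
      /* Broadcast lane 0 of A and widening-multiply: 8 x (int8 * int8) -> int16. */
      const int16x8_t vprod0x01234567c0 =
          vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      /* Widen-add both halves of the products into the 32-bit accumulators. */
      *vacc0x0123 = vaddw_s16(*vacc0x0123, vget_low_s16(vprod0x01234567c0));
      *vacc0x4567 = vaddw_s16(*vacc0x4567, vget_high_s16(vprod0x01234567c0));
    }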
/external/XNNPACK/src/qs8-gemm/gen/

D | 4x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup():
      87: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      90: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      93: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      96: const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
      101: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      104: const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
      107: const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
      110: const int16x8_t vprod3x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va3, 0));
      115: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      118: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      [all …]
D | 3x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup():
      76: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      79: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      82: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      87: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      90: const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
      93: const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
      98: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      101: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      104: const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
      109: const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
      [all …]
D | 2x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup():
      65: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      68: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      73: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      76: const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
      81: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      84: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      89: const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
      92: const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
      97: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      100: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      [all …]
D | 4x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup():
      79: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      82: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      85: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      88: const int16x8_t vprod3x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va3, 0));
      93: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      96: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      99: const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
      102: const int16x8_t vprod3x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va3, 1));
      107: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      110: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      [all …]
D | 3x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup():
      70: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      73: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      76: const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
      81: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      84: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      87: const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
      92: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      95: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      98: const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
      103: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      [all …]
D | 2x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup():
      61: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      64: const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
      69: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      72: const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
      77: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      80: const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
      85: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      88: const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
      93: const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
      96: const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
      [all …]
D | 1x16-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup():
      54: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      59: const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
      64: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      69: const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
      74: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      79: const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
      84: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      89: const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
      94: const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
      99: const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
      [all …]
D | 1x8-minmax-neon-mull-addw-dup.c
    matches in xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup():
      52: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      57: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      62: const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      67: const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
      72: const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
      77: const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
      82: const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
      87: const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
      98: const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
      105: const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
      [all …]
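The qs8-gemm matches mirror the qs8-igemm ones; the only structural difference visible here is that the 16-column kernels pair each duplicated A lane with two 8-wide B slices (vb01234567cK and vb89ABCDEFcK), which is why the matches come in 0..7 / 8..F pairs. A sketch of one depth step of such a 16-column row, again with hypothetical accumulator names (the generated files repeat the vdup_lane_s8 call per product and leave common-subexpression folding to the compiler):

    #include <arm_neon.h>

    /* Sketch: one depth step of a 1-row, 16-column kernel. The accumulator
     * layout (vacc0[0..3] covering columns 0..3, 4..7, 8..B, C..F) is an
     * assumption made for this illustration. */
    static inline void qs8_mull_addw_dup_step_n16(
        int8x8_t va0,
        int8x8_t vb01234567c0, int8x8_t vb89ABCDEFc0,
        int32x4_t vacc0[4])
    {
      const int8x8_t va0c0 = vdup_lane_s8(va0, 0);  /* broadcast lane 0 once */
      const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, va0c0);
      const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, va0c0);
      vacc0[0] = vaddw_s16(vacc0[0], vget_low_s16(vprod0x01234567c0));
      vacc0[1] = vaddw_s16(vacc0[1], vget_high_s16(vprod0x01234567c0));
      vacc0[2] = vaddw_s16(vacc0[2], vget_low_s16(vprod0x89ABCDEFc0));
      vacc0[3] = vaddw_s16(vacc0[3], vget_high_s16(vprod0x89ABCDEFc0));
    }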
/external/libvpx/libvpx/vp8/common/arm/neon/

D | sixtappredict_neon.c
    matches in vp8_sixtap_predict8x4_neon():
      407: d0s8 = vdup_lane_s8(dtmps8, 0);
      408: d1s8 = vdup_lane_s8(dtmps8, 1);
      409: d2s8 = vdup_lane_s8(dtmps8, 2);
      410: d3s8 = vdup_lane_s8(dtmps8, 3);
      411: d4s8 = vdup_lane_s8(dtmps8, 4);
      412: d5s8 = vdup_lane_s8(dtmps8, 5);
      501: d0s8 = vdup_lane_s8(dtmps8, 0);
      502: d1s8 = vdup_lane_s8(dtmps8, 1);
      503: d2s8 = vdup_lane_s8(dtmps8, 2);
      504: d3s8 = vdup_lane_s8(dtmps8, 3);
      [all …]
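In the libvpx six-tap predictor vdup_lane_s8 serves a different purpose: the six signed 8-bit filter taps sit in a single d-register (dtmps8 in the matches) and each tap is splatted into its own vector so it can multiply a whole row of pixels at once. A sketch of that splat; the int8_t filter[8] input layout is an assumption for illustration:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Splat each of the six filter taps into its own 8-lane vector, mirroring
     * the d0s8..d5s8 = vdup_lane_s8(dtmps8, 0..5) lines matched above. */
    static void splat_sixtap_filter(const int8_t filter[8], int8x8_t taps[6])
    {
      const int8x8_t dtmps8 = vld1_s8(filter);  /* taps occupy lanes 0..5 */
      taps[0] = vdup_lane_s8(dtmps8, 0);
      taps[1] = vdup_lane_s8(dtmps8, 1);
      taps[2] = vdup_lane_s8(dtmps8, 2);
      taps[3] = vdup_lane_s8(dtmps8, 3);
      taps[4] = vdup_lane_s8(dtmps8, 4);
      taps[5] = vdup_lane_s8(dtmps8, 5);
    }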
/external/XNNPACK/src/qs8-gemm/

D | neon-mull-addw-dup.c.in
      76: …nt16x8_t vprod${M}x${ABC[N:N+8]}c${K} = vmull_s8(vb${ABC[N:N+8]}c${K}, vdup_lane_s8(va${M}, ${K}));
      91: … const int16x8_t vprod${M}x${ABC[N:N+8]}c0 = vmull_s8(vb${ABC[N:N+8]}c0, vdup_lane_s8(va${M}, 0));
      101: … const int16x8_t vprod${M}x${ABC[N:N+8]}c1 = vmull_s8(vb${ABC[N:N+8]}c1, vdup_lane_s8(va${M}, 1));
      111: … const int16x8_t vprod${M}x${ABC[N:N+8]}c2 = vmull_s8(vb${ABC[N:N+8]}c2, vdup_lane_s8(va${M}, 2));
      121: … const int16x8_t vprod${M}x${ABC[N:N+8]}c3 = vmull_s8(vb${ABC[N:N+8]}c3, vdup_lane_s8(va${M}, 3));
      131: … const int16x8_t vprod${M}x${ABC[N:N+8]}c4 = vmull_s8(vb${ABC[N:N+8]}c4, vdup_lane_s8(va${M}, 4));
      141: … const int16x8_t vprod${M}x${ABC[N:N+8]}c5 = vmull_s8(vb${ABC[N:N+8]}c5, vdup_lane_s8(va${M}, 5));
      151: … const int16x8_t vprod${M}x${ABC[N:N+8]}c6 = vmull_s8(vb${ABC[N:N+8]}c6, vdup_lane_s8(va${M}, 6));
/external/XNNPACK/src/qs8-igemm/

D | neon-mull-addw-dup.c.in
      84: …nt16x8_t vprod${M}x${ABC[N:N+8]}c${K} = vmull_s8(vb${ABC[N:N+8]}c${K}, vdup_lane_s8(va${M}, ${K}));
      99: … const int16x8_t vprod${M}x${ABC[N:N+8]}c0 = vmull_s8(vb${ABC[N:N+8]}c0, vdup_lane_s8(va${M}, 0));
      109: … const int16x8_t vprod${M}x${ABC[N:N+8]}c1 = vmull_s8(vb${ABC[N:N+8]}c1, vdup_lane_s8(va${M}, 1));
      119: … const int16x8_t vprod${M}x${ABC[N:N+8]}c2 = vmull_s8(vb${ABC[N:N+8]}c2, vdup_lane_s8(va${M}, 2));
      129: … const int16x8_t vprod${M}x${ABC[N:N+8]}c3 = vmull_s8(vb${ABC[N:N+8]}c3, vdup_lane_s8(va${M}, 3));
      139: … const int16x8_t vprod${M}x${ABC[N:N+8]}c4 = vmull_s8(vb${ABC[N:N+8]}c4, vdup_lane_s8(va${M}, 4));
      149: … const int16x8_t vprod${M}x${ABC[N:N+8]}c5 = vmull_s8(vb${ABC[N:N+8]}c5, vdup_lane_s8(va${M}, 5));
      159: … const int16x8_t vprod${M}x${ABC[N:N+8]}c6 = vmull_s8(vb${ABC[N:N+8]}c6, vdup_lane_s8(va${M}, 6));
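These two .c.in files are the templates from which the generated kernels listed earlier are produced: ${M} is the row index, ${ABC[N:N+8]} names the 8-column slice, and ${K} is the lane of the A vector being broadcast. As a concrete example, substituting M=0, the 01234567 slice, and K=2 reproduces the statement seen in the generated 1x8 kernels (for instance line 62 of the qs8-gemm 1x8 file above); it is wrapped here in a hypothetical helper so the snippet stands alone:

    #include <arm_neon.h>

    /* Expansion example: the template line with ${M}=0, ${ABC[N:N+8]}=01234567
     * and ${K}=2 becomes the statement below. The helper function itself is a
     * hypothetical wrapper, not part of the generated kernels. */
    static inline int16x8_t expand_m0_slice01234567_c2(int8x8_t va0, int8x8_t vb01234567c2)
    {
      const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
      return vprod0x01234567c2;
    }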
/external/llvm-project/clang/test/CodeGen/

D | arm_neon_intrinsics.c
      2443: return vdup_lane_s8(a, 7);   in test_vdup_lane_s8()
/external/clang/test/CodeGen/

D | arm_neon_intrinsics.c
      2644: return vdup_lane_s8(a, 7);   in test_vdup_lane_s8()
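Both clang CodeGen tests exercise the same contract: the lane argument must be a constant in the range 0..7, and the result is that lane broadcast across all eight output lanes. In scalar terms the tested call is equivalent to extracting lane 7 and re-splatting it:

    #include <arm_neon.h>

    /* Reference model for the tested call vdup_lane_s8(a, 7): extract lane 7
     * and broadcast it. Both forms should yield bit-identical vectors. */
    int8x8_t vdup_lane_s8_lane7_model(int8x8_t a)
    {
      return vdup_n_s8(vget_lane_s8(a, 7));
    }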
/external/neon_2_sse/

D | NEON_2_SSE.h
      1712: _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
      12419: _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[…
      12420: #define vdup_lane_s8 vdup_lane_u8   macro
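NEON_2_SSE declares the signed variant but implements it by aliasing the unsigned one (the #define at line 12420). The alias is sound because lane duplication only copies bit patterns, so signedness cannot affect the result; the same equivalence can be written in plain NEON with reinterpret casts:

    #include <arm_neon.h>

    /* Duplicating a lane through the unsigned intrinsic and reinterpreting back
     * yields the same bits as vdup_lane_s8(v, 7); this is why the header can
     * alias the signed name to the unsigned implementation. */
    int8x8_t dup_lane7_via_u8(int8x8_t v)
    {
      return vreinterpret_s8_u8(vdup_lane_u8(vreinterpret_u8_s8(v), 7));
    }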