• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_dot_prod_q31.c
4  * Description:  Q31 dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/basic_math_functions.h"
30 
31 /**
32   @ingroup groupMath
33  */
34 
35 /**
36   @addtogroup BasicDotProd
37   @{
38  */
39 
40 /**
41   @brief         Dot product of Q31 vectors.
42   @param[in]     pSrcA      points to the first input vector.
43   @param[in]     pSrcB      points to the second input vector.
44   @param[in]     blockSize  number of samples in each vector.
45   @param[out]    result     output result returned here.
46   @return        none
47 
48   @par           Scaling and Overflow Behavior
49                    The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
50                    are truncated to 2.48 format by discarding the lower 14 bits.
51                    The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
52                    There are 15 guard bits in the accumulator and there is no risk of overflow as long as
53                    the length of the vectors is less than 2^16 elements.
54                    The return result is in 16.48 format.
55  */
56 
57 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
58 
59 #include "arm_helium_utils.h"
60 
/*
 * Helium (MVE) variant of the Q31 dot product.
 *
 * pSrcA, pSrcB : input vectors, blockSize Q31 samples each (read only).
 * blockSize    : number of samples in each vector.
 * result       : receives the accumulated dot product (16.48 format per the
 *                function documentation).
 *
 * Full groups of four samples are consumed by the main loop; the remaining
 * 0..3 samples are handled by a single predicated step.
 */
void arm_dot_prod_q31(
    const q31_t * pSrcA,
    const q31_t * pSrcB,
    uint32_t blockSize,
    q63_t * result)
{
    uint32_t  blkCnt;           /* loop counter */
    q31x4_t vecA;
    q31x4_t vecB;
    q63_t     sum = 0LL;

    /* Main loop: 4 samples per iteration */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        /*
         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
         * Accumulate the four lane products into the 64-bit running sum.
         */
        vecA = vld1q(pSrcA);
        vecB = vld1q(pSrcB);
        sum = vrmlaldavhaq(sum, vecA, vecB);

        /* One group of four done */
        blkCnt--;

        /* Advance both source pointers past the consumed group */
        pSrcA += 4;
        pSrcB += 4;
    }

    /*
     * Tail: 1 to 3 leftover samples.
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* Predicate enabling only the first blkCnt 32-bit lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);

        /*
         * Use predicated (zeroing) loads so that only the remaining valid
         * samples are fetched. A plain vld1q here would always read four
         * elements, i.e. up to 12 bytes beyond the end of each input buffer.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
    }

    /*
     * vrmlaldavhaq provides extra intermediate accumulator headroom,
     * limiting the need for intermediate scaling.
     * The scalar variant uses a 2.48 accumulator format by right shifting
     * each product by 14; here the conversion to the 16.48 output format is
     * done once, outside the loop, by shifting the accumulator by the
     * remaining (14 - 8) = 6 bits.
     */
    *result = asrl(sum, (14 - 8));
}
113 
114 #else
/*
 * Portable (scalar) variant of the Q31 dot product.
 *
 * Each 64-bit product of two Q31 samples is shifted right by 14 bits before
 * being added to the 64-bit accumulator; the final accumulator value is
 * written to *result. When ARM_MATH_LOOPUNROLL is defined, samples are first
 * consumed in groups of four and the leftover samples are handled one by one.
 */
void arm_dot_prod_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t blockSize,
        q63_t * result)
{
  q63_t acc = 0;                                 /* Running sum of the scaled products */
  uint32_t remaining;                            /* Samples left for the one-by-one loop */

#if defined (ARM_MATH_LOOPUNROLL)

  uint32_t groups;                               /* Number of complete groups of four */

  /* Unrolled section: four products per pass */
  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    acc += ((q63_t) pSrcA[0] * pSrcB[0]) >> 14U;
    acc += ((q63_t) pSrcA[1] * pSrcB[1]) >> 14U;
    acc += ((q63_t) pSrcA[2] * pSrcB[2]) >> 14U;
    acc += ((q63_t) pSrcA[3] * pSrcB[3]) >> 14U;

    /* Step over the four samples just consumed */
    pSrcA += 4;
    pSrcB += 4;
  }

  /* Samples that did not fill a complete group of four */
  remaining = blockSize & 0x3U;

#else

  /* No unrolling: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining > 0U; remaining--)
  {
    /* Widen to 64 bits before multiplying, then scale the product down by 14 bits */
    const q63_t product = (q63_t) *pSrcA * *pSrcB;

    acc += product >> 14U;

    pSrcA++;
    pSrcB++;
  }

  /* Hand the accumulated value (16.48 format) back to the caller */
  *result = acc;
}
170 #endif /* defined(ARM_MATH_MVEI) */
171 
172 /**
173   @} end of BasicDotProd group
174  */
175