• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_dct4_q31.c
4  * Description:  Processing function of DCT4 & IDCT4 Q31
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 
31 /**
32   @addtogroup DCT4_IDCT4
33   @{
34  */
35 
36 /**
37   @brief         Processing function for the Q31 DCT4/IDCT4.
38   @param[in]     S             points to an instance of the Q31 DCT4 structure.
39   @param[in]     pState        points to state buffer.
40   @param[in,out] pInlineBuffer points to the in-place input and output buffer.
41   @return        none
42 
43   @par           Input an output formats
44                    Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
45                    as the conversion from DCT2 to DCT4 involves one subtraction.
46                    Internally inputs are downscaled in the RFFT process function to avoid overflows.
47                    Number of bits downscaled, depends on the size of the transform.
48                    The input and output formats for different DCT sizes and number of bits to upscale are
49                    mentioned in the table below:
50 
51                    \image html dct4FormatsQ31Table.gif
52  */
53 
arm_dct4_q31(const arm_dct4_instance_q31 * S,q31_t * pState,q31_t * pInlineBuffer)54 void arm_dct4_q31(
55   const arm_dct4_instance_q31 * S,
56         q31_t * pState,
57         q31_t * pInlineBuffer)
58 {
59   const q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
60   const q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
61         q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
62         q31_t in;                                      /* Temporary variable */
63         uint32_t i;                                    /* Loop counter */
64 
65 
66   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
67    * along with some pre-processing and post-processing.
68    * Computational procedure is explained as follows:
69    * (a) Pre-processing involves multiplying input with cos factor,
70    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
71    *              where,
72    *                 r(n) -- output of preprocessing
73    *                 u(n) -- input to preprocessing(actual Source buffer)
74    * (b) Calculation of DCT2 using FFT is divided into three steps:
75    *                  Step1: Re-ordering of even and odd elements of input.
76    *                  Step2: Calculating FFT of the re-ordered input.
77    *                  Step3: Taking the real part of the product of FFT output and weights.
78    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
79    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
80    *                        where,
81    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
82    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
83    */
84 
85   /*-------- Pre-processing ------------*/
86   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
87   arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
88   arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
89 
90   /* ----------------------------------------------------------------
91    * Step1: Re-ordering of even and odd elements as
92    *             pState[i] =  pInlineBuffer[2*i] and
93    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
94    ---------------------------------------------------------------------*/
95 
96   /* pS1 initialized to pState */
97   pS1 = pState;
98 
99   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
100   pS2 = pState + (S->N - 1U);
101 
102   /* pbuff initialized to input buffer */
103   pbuff = pInlineBuffer;
104 
105 
106 #if defined (ARM_MATH_LOOPUNROLL)
107 
108   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
109   i = S->Nby2 >> 2U;
110 
111   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
112    ** a second loop below computes the remaining 1 to 3 samples. */
113   do
114   {
115     /* Re-ordering of even and odd elements */
116     /* pState[i] =  pInlineBuffer[2*i] */
117     *pS1++ = *pbuff++;
118     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
119     *pS2-- = *pbuff++;
120 
121     *pS1++ = *pbuff++;
122     *pS2-- = *pbuff++;
123 
124     *pS1++ = *pbuff++;
125     *pS2-- = *pbuff++;
126 
127     *pS1++ = *pbuff++;
128     *pS2-- = *pbuff++;
129 
130     /* Decrement loop counter */
131     i--;
132   } while (i > 0U);
133 
134   /* pbuff initialized to input buffer */
135   pbuff = pInlineBuffer;
136 
137   /* pS1 initialized to pState */
138   pS1 = pState;
139 
140   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
141   i = S->N >> 2U;
142 
143   /* Processing with loop unrolling 4 times as N is always multiple of 4.
144    * Compute 4 outputs at a time */
145   do
146   {
147     /* Writing the re-ordered output back to inplace input buffer */
148     *pbuff++ = *pS1++;
149     *pbuff++ = *pS1++;
150     *pbuff++ = *pS1++;
151     *pbuff++ = *pS1++;
152 
153     /* Decrement the loop counter */
154     i--;
155   } while (i > 0U);
156 
157 
158   /* ---------------------------------------------------------
159    *     Step2: Calculate RFFT for N-point input
160    * ---------------------------------------------------------- */
161   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
162   arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
163 
164   /*----------------------------------------------------------------------
165    *  Step3: Multiply the FFT output with the weights.
166    *----------------------------------------------------------------------*/
167   arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
168 
169   /* The output of complex multiplication is in 3.29 format.
170    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
171   arm_shift_q31 (pState, 2, pState, S->N * 2);
172 
173   /* ----------- Post-processing ---------- */
174   /* DCT-IV can be obtained from DCT-II by the equation,
175    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
176    *       Hence, Y4(0) = Y2(0)/2  */
177   /* Getting only real part from the output and Converting to DCT-IV */
178 
179   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
180   i = (S->N - 1U) >> 2U;
181 
182   /* pbuff initialized to input buffer. */
183   pbuff = pInlineBuffer;
184 
185   /* pS1 initialized to pState */
186   pS1 = pState;
187 
188   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
189   in = *pS1++ >> 1U;
190   /* input buffer acts as inplace, so output values are stored in the input itself. */
191   *pbuff++ = in;
192 
193   /* pState pointer is incremented twice as the real values are located alternatively in the array */
194   pS1++;
195 
196   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
197    ** a second loop below computes the remaining 1 to 3 samples. */
198   do
199   {
200     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
201     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
202     in = *pS1++ - in;
203     *pbuff++ = in;
204     /* points to the next real value */
205     pS1++;
206 
207     in = *pS1++ - in;
208     *pbuff++ = in;
209     pS1++;
210 
211     in = *pS1++ - in;
212     *pbuff++ = in;
213     pS1++;
214 
215     in = *pS1++ - in;
216     *pbuff++ = in;
217     pS1++;
218 
219     /* Decrement the loop counter */
220     i--;
221   } while (i > 0U);
222 
223   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
224    ** No loop unrolling is used. */
225   i = (S->N - 1U) % 0x4U;
226 
227   while (i > 0U)
228   {
229     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
230     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
231     in = *pS1++ - in;
232     *pbuff++ = in;
233 
234     /* points to the next real value */
235     pS1++;
236 
237     /* Decrement loop counter */
238     i--;
239   }
240 
241 
242   /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
243 
244   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
245   i = S->N >> 2U;
246 
247   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
248   pbuff = pInlineBuffer;
249 
250   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
251   do
252   {
253     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
254     in = *pbuff;
255     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
256 
257     in = *pbuff;
258     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
259 
260     in = *pbuff;
261     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
262 
263     in = *pbuff;
264     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
265 
266     /* Decrement loop counter */
267     i--;
268   } while (i > 0U);
269 
270 
271 #else
272 
273   /* Initializing the loop counter to N/2 */
274   i = S->Nby2;
275 
276   do
277   {
278     /* Re-ordering of even and odd elements */
279     /* pState[i] =  pInlineBuffer[2*i] */
280     *pS1++ = *pbuff++;
281     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
282     *pS2-- = *pbuff++;
283 
284     /* Decrement the loop counter */
285     i--;
286   } while (i > 0U);
287 
288   /* pbuff initialized to input buffer */
289   pbuff = pInlineBuffer;
290 
291   /* pS1 initialized to pState */
292   pS1 = pState;
293 
294   /* Initializing the loop counter */
295   i = S->N;
296 
297   do
298   {
299     /* Writing the re-ordered output back to inplace input buffer */
300     *pbuff++ = *pS1++;
301 
302     /* Decrement the loop counter */
303     i--;
304   } while (i > 0U);
305 
306 
307   /* ---------------------------------------------------------
308    *     Step2: Calculate RFFT for N-point input
309    * ---------------------------------------------------------- */
310   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
311   arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
312 
313   /*----------------------------------------------------------------------
314    *  Step3: Multiply the FFT output with the weights.
315    *----------------------------------------------------------------------*/
316   arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
317 
318   /* The output of complex multiplication is in 3.29 format.
319    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
320   arm_shift_q31(pState, 2, pState, S->N * 2);
321 
322   /* ----------- Post-processing ---------- */
323   /* DCT-IV can be obtained from DCT-II by the equation,
324    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
325    *       Hence, Y4(0) = Y2(0)/2  */
326   /* Getting only real part from the output and Converting to DCT-IV */
327 
328   /* pbuff initialized to input buffer. */
329   pbuff = pInlineBuffer;
330 
331   /* pS1 initialized to pState */
332   pS1 = pState;
333 
334   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
335   in = *pS1++ >> 1U;
336   /* input buffer acts as inplace, so output values are stored in the input itself. */
337   *pbuff++ = in;
338 
339   /* pState pointer is incremented twice as the real values are located alternatively in the array */
340   pS1++;
341 
342   /* Initializing the loop counter */
343   i = (S->N - 1U);
344 
345   while (i > 0U)
346   {
347     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
348     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
349     in = *pS1++ - in;
350     *pbuff++ = in;
351 
352     /* points to the next real value */
353     pS1++;
354 
355     /* Decrement loop counter */
356     i--;
357   }
358 
359   /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
360 
361   /* Initializing loop counter */
362   i = S->N;
363 
364   /* pbuff initialized to the pInlineBuffer (now contains the output values) */
365   pbuff = pInlineBuffer;
366 
367   do
368   {
369     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
370     in = *pbuff;
371     *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
372 
373     /* Decrement loop counter */
374     i--;
375   } while (i > 0U);
376 
377 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
378 
379 }
380 
381 /**
382   @} end of DCT4_IDCT4 group
383  */
384