1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_dct4_q31.c
4 * Description: Processing function of DCT4 & IDCT4 Q31
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions.h"
30
31 /**
32 @addtogroup DCT4_IDCT4
33 @{
34 */
35
36 /**
37 @brief Processing function for the Q31 DCT4/IDCT4.
38 @param[in] S points to an instance of the Q31 DCT4 structure.
39 @param[in] pState points to state buffer.
40 @param[in,out] pInlineBuffer points to the in-place input and output buffer.
41 @return none
42
43 @par Input an output formats
44 Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
45 as the conversion from DCT2 to DCT4 involves one subtraction.
46 Internally inputs are downscaled in the RFFT process function to avoid overflows.
47 Number of bits downscaled, depends on the size of the transform.
48 The input and output formats for different DCT sizes and number of bits to upscale are
49 mentioned in the table below:
50
51 \image html dct4FormatsQ31Table.gif
52 */
53
arm_dct4_q31(const arm_dct4_instance_q31 * S,q31_t * pState,q31_t * pInlineBuffer)54 void arm_dct4_q31(
55 const arm_dct4_instance_q31 * S,
56 q31_t * pState,
57 q31_t * pInlineBuffer)
58 {
59 const q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */
60 const q31_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
61 q31_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
62 q31_t in; /* Temporary variable */
63 uint32_t i; /* Loop counter */
64
65
66 /* DCT4 computation involves DCT2 (which is calculated using RFFT)
67 * along with some pre-processing and post-processing.
68 * Computational procedure is explained as follows:
69 * (a) Pre-processing involves multiplying input with cos factor,
70 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
71 * where,
72 * r(n) -- output of preprocessing
73 * u(n) -- input to preprocessing(actual Source buffer)
74 * (b) Calculation of DCT2 using FFT is divided into three steps:
75 * Step1: Re-ordering of even and odd elements of input.
76 * Step2: Calculating FFT of the re-ordered input.
77 * Step3: Taking the real part of the product of FFT output and weights.
78 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
79 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
80 * where,
81 * Y4 -- DCT4 output, Y2 -- DCT2 output
82 * (d) Multiplying the output with the normalizing factor sqrt(2/N).
83 */
84
85 /*-------- Pre-processing ------------*/
86 /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
87 arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
88 arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
89
90 /* ----------------------------------------------------------------
91 * Step1: Re-ordering of even and odd elements as
92 * pState[i] = pInlineBuffer[2*i] and
93 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
94 ---------------------------------------------------------------------*/
95
96 /* pS1 initialized to pState */
97 pS1 = pState;
98
99 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
100 pS2 = pState + (S->N - 1U);
101
102 /* pbuff initialized to input buffer */
103 pbuff = pInlineBuffer;
104
105
106 #if defined (ARM_MATH_LOOPUNROLL)
107
108 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
109 i = S->Nby2 >> 2U;
110
111 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
112 ** a second loop below computes the remaining 1 to 3 samples. */
113 do
114 {
115 /* Re-ordering of even and odd elements */
116 /* pState[i] = pInlineBuffer[2*i] */
117 *pS1++ = *pbuff++;
118 /* pState[N-i-1] = pInlineBuffer[2*i+1] */
119 *pS2-- = *pbuff++;
120
121 *pS1++ = *pbuff++;
122 *pS2-- = *pbuff++;
123
124 *pS1++ = *pbuff++;
125 *pS2-- = *pbuff++;
126
127 *pS1++ = *pbuff++;
128 *pS2-- = *pbuff++;
129
130 /* Decrement loop counter */
131 i--;
132 } while (i > 0U);
133
134 /* pbuff initialized to input buffer */
135 pbuff = pInlineBuffer;
136
137 /* pS1 initialized to pState */
138 pS1 = pState;
139
140 /* Initializing the loop counter to N/4 instead of N for loop unrolling */
141 i = S->N >> 2U;
142
143 /* Processing with loop unrolling 4 times as N is always multiple of 4.
144 * Compute 4 outputs at a time */
145 do
146 {
147 /* Writing the re-ordered output back to inplace input buffer */
148 *pbuff++ = *pS1++;
149 *pbuff++ = *pS1++;
150 *pbuff++ = *pS1++;
151 *pbuff++ = *pS1++;
152
153 /* Decrement the loop counter */
154 i--;
155 } while (i > 0U);
156
157
158 /* ---------------------------------------------------------
159 * Step2: Calculate RFFT for N-point input
160 * ---------------------------------------------------------- */
161 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
162 arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
163
164 /*----------------------------------------------------------------------
165 * Step3: Multiply the FFT output with the weights.
166 *----------------------------------------------------------------------*/
167 arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
168
169 /* The output of complex multiplication is in 3.29 format.
170 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
171 arm_shift_q31 (pState, 2, pState, S->N * 2);
172
173 /* ----------- Post-processing ---------- */
174 /* DCT-IV can be obtained from DCT-II by the equation,
175 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
176 * Hence, Y4(0) = Y2(0)/2 */
177 /* Getting only real part from the output and Converting to DCT-IV */
178
179 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
180 i = (S->N - 1U) >> 2U;
181
182 /* pbuff initialized to input buffer. */
183 pbuff = pInlineBuffer;
184
185 /* pS1 initialized to pState */
186 pS1 = pState;
187
188 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
189 in = *pS1++ >> 1U;
190 /* input buffer acts as inplace, so output values are stored in the input itself. */
191 *pbuff++ = in;
192
193 /* pState pointer is incremented twice as the real values are located alternatively in the array */
194 pS1++;
195
196 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
197 ** a second loop below computes the remaining 1 to 3 samples. */
198 do
199 {
200 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
201 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
202 in = *pS1++ - in;
203 *pbuff++ = in;
204 /* points to the next real value */
205 pS1++;
206
207 in = *pS1++ - in;
208 *pbuff++ = in;
209 pS1++;
210
211 in = *pS1++ - in;
212 *pbuff++ = in;
213 pS1++;
214
215 in = *pS1++ - in;
216 *pbuff++ = in;
217 pS1++;
218
219 /* Decrement the loop counter */
220 i--;
221 } while (i > 0U);
222
223 /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
224 ** No loop unrolling is used. */
225 i = (S->N - 1U) % 0x4U;
226
227 while (i > 0U)
228 {
229 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
230 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
231 in = *pS1++ - in;
232 *pbuff++ = in;
233
234 /* points to the next real value */
235 pS1++;
236
237 /* Decrement loop counter */
238 i--;
239 }
240
241
242 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
243
244 /* Initializing the loop counter to N/4 instead of N for loop unrolling */
245 i = S->N >> 2U;
246
247 /* pbuff initialized to the pInlineBuffer(now contains the output values) */
248 pbuff = pInlineBuffer;
249
250 /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
251 do
252 {
253 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
254 in = *pbuff;
255 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
256
257 in = *pbuff;
258 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
259
260 in = *pbuff;
261 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
262
263 in = *pbuff;
264 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
265
266 /* Decrement loop counter */
267 i--;
268 } while (i > 0U);
269
270
271 #else
272
273 /* Initializing the loop counter to N/2 */
274 i = S->Nby2;
275
276 do
277 {
278 /* Re-ordering of even and odd elements */
279 /* pState[i] = pInlineBuffer[2*i] */
280 *pS1++ = *pbuff++;
281 /* pState[N-i-1] = pInlineBuffer[2*i+1] */
282 *pS2-- = *pbuff++;
283
284 /* Decrement the loop counter */
285 i--;
286 } while (i > 0U);
287
288 /* pbuff initialized to input buffer */
289 pbuff = pInlineBuffer;
290
291 /* pS1 initialized to pState */
292 pS1 = pState;
293
294 /* Initializing the loop counter */
295 i = S->N;
296
297 do
298 {
299 /* Writing the re-ordered output back to inplace input buffer */
300 *pbuff++ = *pS1++;
301
302 /* Decrement the loop counter */
303 i--;
304 } while (i > 0U);
305
306
307 /* ---------------------------------------------------------
308 * Step2: Calculate RFFT for N-point input
309 * ---------------------------------------------------------- */
310 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
311 arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
312
313 /*----------------------------------------------------------------------
314 * Step3: Multiply the FFT output with the weights.
315 *----------------------------------------------------------------------*/
316 arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
317
318 /* The output of complex multiplication is in 3.29 format.
319 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
320 arm_shift_q31(pState, 2, pState, S->N * 2);
321
322 /* ----------- Post-processing ---------- */
323 /* DCT-IV can be obtained from DCT-II by the equation,
324 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
325 * Hence, Y4(0) = Y2(0)/2 */
326 /* Getting only real part from the output and Converting to DCT-IV */
327
328 /* pbuff initialized to input buffer. */
329 pbuff = pInlineBuffer;
330
331 /* pS1 initialized to pState */
332 pS1 = pState;
333
334 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
335 in = *pS1++ >> 1U;
336 /* input buffer acts as inplace, so output values are stored in the input itself. */
337 *pbuff++ = in;
338
339 /* pState pointer is incremented twice as the real values are located alternatively in the array */
340 pS1++;
341
342 /* Initializing the loop counter */
343 i = (S->N - 1U);
344
345 while (i > 0U)
346 {
347 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
348 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
349 in = *pS1++ - in;
350 *pbuff++ = in;
351
352 /* points to the next real value */
353 pS1++;
354
355 /* Decrement loop counter */
356 i--;
357 }
358
359 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
360
361 /* Initializing loop counter */
362 i = S->N;
363
364 /* pbuff initialized to the pInlineBuffer (now contains the output values) */
365 pbuff = pInlineBuffer;
366
367 do
368 {
369 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
370 in = *pbuff;
371 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
372
373 /* Decrement loop counter */
374 i--;
375 } while (i > 0U);
376
377 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
378
379 }
380
381 /**
382 @} end of DCT4_IDCT4 group
383 */
384