1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_config.h"
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
14 #include "vpx_dsp/txfm_common.h"
15
16 #if HAVE_DSPR2
/* Row pass of the 4-point inverse DCT (DSPr2).
 *
 * Processes the 4 rows of a 4x4 coefficient block.  Each iteration reads
 * 4 contiguous int16 coefficients from |input| and writes the transformed
 * row TRANSPOSED into |output| (stores at element offsets 0/4/8/12, then
 * output advances by one element), so the column pass can consume
 * contiguous data.
 *
 * NOTE(review): the "extp ..., 31" extracts rely on the DSP control
 * position bits having been programmed by a prior "wrdsp" (done in
 * vpx_idct4x4_16_add_dspr2) — this function must not be called standalone.
 */
void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
  int16_t step_0, step_1, step_2, step_3;
  int Temp0, Temp1, Temp2, Temp3;
  /* Rounding bias added via mtlo before each MAC sequence:
     1 << (DCT_CONST_BITS - 1) == 1 << 13 == 8192. */
  const int const_2_power_13 = 8192;
  int i;

  for (i = 4; i--; ) {
    __asm__ __volatile__ (
      /*
        temp_1 = (input[0] + input[2]) * cospi_16_64;
        step_0 = dct_const_round_shift(temp_1);

        temp_2 = (input[0] - input[2]) * cospi_16_64;
        step_1 = dct_const_round_shift(temp_2);
      */
      "lh %[Temp0], 0(%[input]) \n\t"
      "lh %[Temp1], 4(%[input]) \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "add %[Temp2], %[Temp0], %[Temp1] \n\t"
      "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
      "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
      /* Load input[1]/input[3] early to hide load latency behind the MACs. */
      "lh %[Temp0], 2(%[input]) \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "extp %[step_0], $ac0, 31 \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"

      "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
      "extp %[step_1], $ac1, 31 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      /*
        temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
        step_2 = dct_const_round_shift(temp1);
      */
      "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
      "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
      "extp %[step_2], $ac0, 31 \n\t"

      /*
        temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
        step_3 = dct_const_round_shift(temp2);
      */
      "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
      "extp %[step_3], $ac1, 31 \n\t"

      /*
        Output butterfly, stored transposed (byte offsets 0/8/16/24 are
        int16 elements 0/4/8/12):
        output[0] = step_0 + step_3;
        output[4] = step_1 + step_2;
        output[8] = step_1 - step_2;
        output[12] = step_0 - step_3;
      */
      "add %[Temp0], %[step_0], %[step_3] \n\t"
      "sh %[Temp0], 0(%[output]) \n\t"

      "add %[Temp1], %[step_1], %[step_2] \n\t"
      "sh %[Temp1], 8(%[output]) \n\t"

      "sub %[Temp2], %[step_1], %[step_2] \n\t"
      "sh %[Temp2], 16(%[output]) \n\t"

      "sub %[Temp3], %[step_0], %[step_3] \n\t"
      "sh %[Temp3], 24(%[output]) \n\t"

      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
        [output] "+r" (output)
      : [const_2_power_13] "r" (const_2_power_13),
        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
        [cospi_24_64] "r" (cospi_24_64),
        [input] "r" (input)
    );

    input += 4;   /* next row of coefficients */
    output += 1;  /* next column of the transposed intermediate */
  }
}
101
/* Column pass of the 4-point inverse DCT plus reconstruction (DSPr2).
 *
 * |input| holds the transposed output of vpx_idct4_rows_dspr2 (4 int16 per
 * column, contiguous).  For each of the 4 columns the 4-point IDCT is
 * computed, the result is rounded (+8 >> 4), added to the prediction in
 * |dest| and clamped to [0, 255] via the vpx_ff_cropTbl lookup table.
 *
 * NOTE(review): like the row pass, the "extp" extracts depend on the DSP
 * control position bits programmed by the caller's "wrdsp".
 */
void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                     int dest_stride) {
  int16_t step_0, step_1, step_2, step_3;
  int Temp0, Temp1, Temp2, Temp3;
  /* Rounding bias: 1 << (DCT_CONST_BITS - 1). */
  const int const_2_power_13 = 8192;
  int i;
  uint8_t *dest_pix;
  /* cm points into the clamp table; lbux does cm[sum] to clip to 8 bits. */
  uint8_t *cm = vpx_ff_cropTbl;

  /* prefetch vpx_ff_cropTbl */
  prefetch_load(vpx_ff_cropTbl);
  prefetch_load(vpx_ff_cropTbl + 32);
  prefetch_load(vpx_ff_cropTbl + 64);
  prefetch_load(vpx_ff_cropTbl + 96);
  prefetch_load(vpx_ff_cropTbl + 128);
  prefetch_load(vpx_ff_cropTbl + 160);
  prefetch_load(vpx_ff_cropTbl + 192);
  prefetch_load(vpx_ff_cropTbl + 224);

  for (i = 0; i < 4; ++i) {
    dest_pix = (dest + i);

    __asm__ __volatile__ (
      /*
        temp_1 = (input[0] + input[2]) * cospi_16_64;
        step_0 = dct_const_round_shift(temp_1);

        temp_2 = (input[0] - input[2]) * cospi_16_64;
        step_1 = dct_const_round_shift(temp_2);
      */
      "lh %[Temp0], 0(%[input]) \n\t"
      "lh %[Temp1], 4(%[input]) \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"
      "add %[Temp2], %[Temp0], %[Temp1] \n\t"
      "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
      "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
      "lh %[Temp0], 2(%[input]) \n\t"
      "lh %[Temp1], 6(%[input]) \n\t"
      "extp %[step_0], $ac0, 31 \n\t"
      "mtlo %[const_2_power_13], $ac0 \n\t"
      "mthi $zero, $ac0 \n\t"

      "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
      "extp %[step_1], $ac1, 31 \n\t"
      "mtlo %[const_2_power_13], $ac1 \n\t"
      "mthi $zero, $ac1 \n\t"

      /*
        temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
        step_2 = dct_const_round_shift(temp1);
      */
      "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
      "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
      "extp %[step_2], $ac0, 31 \n\t"

      /*
        temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
        step_3 = dct_const_round_shift(temp2);
      */
      "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
      "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
      "extp %[step_3], $ac1, 31 \n\t"

      /*
        Reconstruction: for each of the four output pixels of this column
          pixel = clip8(dest + ROUND_POWER_OF_TWO(sum, 4))
        where sum takes the butterfly values step_0+step_3, step_1+step_2,
        step_1-step_2, step_0-step_3 in order.  Each stage computes the
        next stage's sum while the current pixel is clamped and stored.
      */
      "add %[Temp0], %[step_0], %[step_3] \n\t"
      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "add %[Temp0], %[step_1], %[step_2] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step_1], %[step_2] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "sub %[Temp0], %[step_0], %[step_3] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"
      "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

      "addi %[Temp0], %[Temp0], 8 \n\t"
      "sra %[Temp0], %[Temp0], 4 \n\t"
      "lbu %[Temp1], 0(%[dest_pix]) \n\t"
      "add %[Temp1], %[Temp1], %[Temp0] \n\t"
      "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
      "sb %[Temp2], 0(%[dest_pix]) \n\t"

      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
        [dest_pix] "+r" (dest_pix)
      : [const_2_power_13] "r" (const_2_power_13),
        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
        [cospi_24_64] "r" (cospi_24_64),
        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );

    input += 4;  /* next column (stored contiguously by the row pass) */
  }
}
223
/* Full 4x4 inverse DCT and reconstruction (up to 16 non-zero coeffs).
 *
 * Programs the DSP control register once, then runs the row pass into a
 * 32-byte-aligned intermediate buffer and the column/add pass into |dest|.
 */
void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc — both passes' "extp" instructions
     depend on this wrdsp having been executed first */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"
    :
    : [pos] "r" (pos)
  );

  // Rows
  vpx_idct4_rows_dspr2(input, outptr);

  // Columns
  vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}
243
/* Fast path for a 4x4 block where only the DC coefficient is non-zero.
 *
 * The whole IDCT collapses to one constant a1 added to every pixel.  a1 is
 * replicated into all four lanes of a register (replv.qb) and applied to a
 * full 4-pixel row at a time with saturating quad-byte add/subtract, so
 * |dest| rows must be 4-byte aligned (noted in the asm comments below).
 */
void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  int a1, absa1;
  int r;
  int32_t out;
  int t2, vector_a1, vector_a;
  uint32_t pos = 45;
  int16_t input_dc = input[0];

  /* bit position for extract from acc (used inside
     DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64) */
  __asm__ __volatile__ (
    "wrdsp %[pos], 1 \n\t"

    :
    : [pos] "r" (pos)
  );

  /* DC passes through both 1-D transforms: scale by cospi_16_64 twice. */
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
  /* Final rounding shift of the column pass: a1 = (out + 8) >> 4. */
  __asm__ __volatile__ (
    "addi %[out], %[out], 8 \n\t"
    "sra %[a1], %[out], 4 \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* Negative DC: subtract |a1| from each pixel with saturation at 0.
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "abs %[absa1], %[a1] \n\t"
      "replv.qb %[vector_a1], %[absa1] \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 4; r--;) {
      __asm__ __volatile__ (
        "lw %[t2], 0(%[dest]) \n\t"
        "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
        "sw %[vector_a], 0(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* Non-negative DC: add a1 to each pixel with saturation at 255.
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
      "replv.qb %[vector_a1], %[a1] \n\t"
      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    for (r = 4; r--;) {
      __asm__ __volatile__ (
        "lw %[t2], 0(%[dest]) \n\t"
        "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
        "sw %[vector_a], 0(%[dest]) \n\t"
        "add %[dest], %[dest], %[dest_stride] \n\t"

        : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
316
/* 4-point inverse ADST (plain C reference used on DSPr2 builds).
 *
 * Reads 4 int16 coefficients from |input| and writes 4 transformed int16
 * values to |output|.  1-D transform scaling factor is sqrt(2); input 14b
 * + 14b multiplier scaling + 1b addition = 29b intermediate, so the
 * rounded output fits in 15 bits.
 */
void iadst4_dspr2(const int16_t *input, int16_t *output) {
  const int a = input[0];
  const int b = input[1];
  const int c = input[2];
  const int d = input[3];
  int t0, t1, t2, t3;

  /* All-zero input maps to all-zero output; skip the arithmetic. */
  if (!(a | b | c | d)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  /* Butterfly stage, with the s0..s7 products folded in directly. */
  t0 = sinpi_1_9 * a + sinpi_4_9 * c + sinpi_2_9 * d;
  t1 = sinpi_2_9 * a - sinpi_1_9 * c - sinpi_4_9 * d;
  t2 = sinpi_3_9 * (a - c + d);
  t3 = sinpi_3_9 * b;

  output[0] = dct_const_round_shift(t0 + t3);
  output[1] = dct_const_round_shift(t1 + t3);
  output[2] = dct_const_round_shift(t2);
  output[3] = dct_const_round_shift(t0 + t1 - t3);
}
359 #endif // #if HAVE_DSPR2
360