1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_config.h"
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
14 #include "vpx_dsp/txfm_common.h"
15
16 #if HAVE_DSPR2
/*
 * Row pass of the 16x16 inverse DCT, written with MIPS DSP inline assembly.
 *
 * input   - coefficients; each iteration reads 16 int16_t values (halfword
 *           loads at byte offsets 0..30) and then advances by 16 elements.
 * output  - destination buffer; each iteration stores its 16 results with
 *           halfword stores at byte offsets 0, 32, ..., 480 and then advances
 *           by one element, so the results are laid out transposed with a
 *           16-element pitch.
 * no_rows - number of iterations (rows) to process.
 *
 * Every multiply-accumulate below preloads its accumulator with
 * const_2_power_13 (8192) and reads the result back with "extp ..., 31".
 * NOTE(review): extp depends on the DSPControl pos field; in this file it is
 * written (value 45) by vpx_idct16x16_256_add_dspr2 before this function is
 * called. 8192 presumably acts as the rounding term for that extraction --
 * confirm against the DSP ASE manual.
 * NOTE(review): the cospi_*_64 constants are not defined in this file;
 * presumably they come from vpx_dsp/txfm_common.h.
 */
idct16_rows_dspr2(const int16_t * input,int16_t * output,uint32_t no_rows)17 void idct16_rows_dspr2(const int16_t *input, int16_t *output,
18 uint32_t no_rows) {
19 int i;
20 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
21 int step1_10, step1_11, step1_12, step1_13;
22 int step2_0, step2_1, step2_2, step2_3;
23 int step2_8, step2_9, step2_10, step2_11;
24 int step2_12, step2_13, step2_14, step2_15;
25 int load1, load2, load3, load4, load5, load6, load7, load8;
26 int result1, result2, result3, result4;
27 const int const_2_power_13 = 8192;
28
/* Counts down from no_rows; the body runs exactly no_rows times. */
29 for (i = no_rows; i--; ) {
30 /* prefetch row */
31 prefetch_load((const uint8_t *)(input + 16));
32
/*
 * Loads input[0], input[8], input[4], input[12] and produces
 * step1_0..step1_3: the sum/difference of input[0] and input[8] is scaled by
 * cospi_16_64, input[4]/input[12] are rotated with cospi_24_64/cospi_8_64,
 * and the four values are combined with add/sub.
 */
33 __asm__ __volatile__ (
34 "lh %[load1], 0(%[input]) \n\t"
35 "lh %[load2], 16(%[input]) \n\t"
36 "lh %[load3], 8(%[input]) \n\t"
37 "lh %[load4], 24(%[input]) \n\t"
38
39 "mtlo %[const_2_power_13], $ac1 \n\t"
40 "mthi $zero, $ac1 \n\t"
41 "mtlo %[const_2_power_13], $ac2 \n\t"
42 "mthi $zero, $ac2 \n\t"
43 "add %[result1], %[load1], %[load2] \n\t"
44 "sub %[result2], %[load1], %[load2] \n\t"
45 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
46 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
47 "extp %[step2_0], $ac1, 31 \n\t"
48 "extp %[step2_1], $ac2, 31 \n\t"
49
50 "mtlo %[const_2_power_13], $ac3 \n\t"
51 "mthi $zero, $ac3 \n\t"
52 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
53 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
54 "extp %[step2_2], $ac3, 31 \n\t"
55
56 "mtlo %[const_2_power_13], $ac1 \n\t"
57 "mthi $zero, $ac1 \n\t"
58 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
59 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
60 "extp %[step2_3], $ac1, 31 \n\t"
61
62 "add %[step1_0], %[step2_0], %[step2_3] \n\t"
63 "add %[step1_1], %[step2_1], %[step2_2] \n\t"
64 "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
65 "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
66
67 : [load1] "=&r" (load1), [load2] "=&r" (load2),
68 [load3] "=&r" (load3), [load4] "=&r" (load4),
69 [result1] "=&r" (result1), [result2] "=&r" (result2),
70 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
71 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
72 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
73 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
74 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
75 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
76 [cospi_16_64] "r" (cospi_16_64)
77 );
78
/*
 * Loads input[1], input[15], input[9], input[7] and produces step2_8,
 * step2_9, step2_14 and step2_15. load5/load6 are reused as scratch for the
 * differences that feed the cospi_24_64/cospi_8_64 rotation.
 */
79 __asm__ __volatile__ (
80 "lh %[load5], 2(%[input]) \n\t"
81 "lh %[load6], 30(%[input]) \n\t"
82 "lh %[load7], 18(%[input]) \n\t"
83 "lh %[load8], 14(%[input]) \n\t"
84
85 "mtlo %[const_2_power_13], $ac1 \n\t"
86 "mthi $zero, $ac1 \n\t"
87 "mtlo %[const_2_power_13], $ac3 \n\t"
88 "mthi $zero, $ac3 \n\t"
89
90 "madd $ac1, %[load5], %[cospi_30_64] \n\t"
91 "msub $ac1, %[load6], %[cospi_2_64] \n\t"
92 "extp %[result1], $ac1, 31 \n\t"
93
94 "madd $ac3, %[load7], %[cospi_14_64] \n\t"
95 "msub $ac3, %[load8], %[cospi_18_64] \n\t"
96 "extp %[result2], $ac3, 31 \n\t"
97
98 "mtlo %[const_2_power_13], $ac1 \n\t"
99 "mthi $zero, $ac1 \n\t"
100 "mtlo %[const_2_power_13], $ac2 \n\t"
101 "mthi $zero, $ac2 \n\t"
102
103 "madd $ac1, %[load7], %[cospi_18_64] \n\t"
104 "madd $ac1, %[load8], %[cospi_14_64] \n\t"
105 "extp %[result3], $ac1, 31 \n\t"
106
107 "madd $ac2, %[load5], %[cospi_2_64] \n\t"
108 "madd $ac2, %[load6], %[cospi_30_64] \n\t"
109 "extp %[result4], $ac2, 31 \n\t"
110
111 "sub %[load5], %[result1], %[result2] \n\t"
112 "sub %[load6], %[result4], %[result3] \n\t"
113
114 "mtlo %[const_2_power_13], $ac1 \n\t"
115 "mthi $zero, $ac1 \n\t"
116 "mtlo %[const_2_power_13], $ac3 \n\t"
117 "mthi $zero, $ac3 \n\t"
118
119 "madd $ac1, %[load6], %[cospi_24_64] \n\t"
120 "msub $ac1, %[load5], %[cospi_8_64] \n\t"
121 "madd $ac3, %[load5], %[cospi_24_64] \n\t"
122 "madd $ac3, %[load6], %[cospi_8_64] \n\t"
123
124 "extp %[step2_9], $ac1, 31 \n\t"
125 "extp %[step2_14], $ac3, 31 \n\t"
126 "add %[step2_8], %[result1], %[result2] \n\t"
127 "add %[step2_15], %[result4], %[result3] \n\t"
128
129 : [load5] "=&r" (load5), [load6] "=&r" (load6),
130 [load7] "=&r" (load7), [load8] "=&r" (load8),
131 [result1] "=&r" (result1), [result2] "=&r" (result2),
132 [result3] "=&r" (result3), [result4] "=&r" (result4),
133 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
134 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
135 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
136 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
137 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
138 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
139 );
140
/*
 * Loads input[5], input[11], input[13], input[3] and produces
 * step2_10..step2_13. Both products for step2_10 are subtracted (msub), so
 * it is the negated rotation of the two differences.
 */
141 __asm__ __volatile__ (
142 "lh %[load1], 10(%[input]) \n\t"
143 "lh %[load2], 22(%[input]) \n\t"
144 "lh %[load3], 26(%[input]) \n\t"
145 "lh %[load4], 6(%[input]) \n\t"
146
147 "mtlo %[const_2_power_13], $ac1 \n\t"
148 "mthi $zero, $ac1 \n\t"
149 "mtlo %[const_2_power_13], $ac3 \n\t"
150 "mthi $zero, $ac3 \n\t"
151
152 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
153 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
154 "extp %[result1], $ac1, 31 \n\t"
155
156 "madd $ac3, %[load3], %[cospi_6_64] \n\t"
157 "msub $ac3, %[load4], %[cospi_26_64] \n\t"
158 "extp %[result2], $ac3, 31 \n\t"
159
160 "mtlo %[const_2_power_13], $ac1 \n\t"
161 "mthi $zero, $ac1 \n\t"
162 "mtlo %[const_2_power_13], $ac2 \n\t"
163 "mthi $zero, $ac2 \n\t"
164
165 "madd $ac1, %[load1], %[cospi_10_64] \n\t"
166 "madd $ac1, %[load2], %[cospi_22_64] \n\t"
167 "extp %[result3], $ac1, 31 \n\t"
168
169 "madd $ac2, %[load3], %[cospi_26_64] \n\t"
170 "madd $ac2, %[load4], %[cospi_6_64] \n\t"
171 "extp %[result4], $ac2, 31 \n\t"
172
173 "mtlo %[const_2_power_13], $ac1 \n\t"
174 "mthi $zero, $ac1 \n\t"
175 "mtlo %[const_2_power_13], $ac3 \n\t"
176 "mthi $zero, $ac3 \n\t"
177
178 "sub %[load1], %[result2], %[result1] \n\t"
179 "sub %[load2], %[result4], %[result3] \n\t"
180
181 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
182 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
183 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
184 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
185
186 "extp %[step2_10], $ac1, 31 \n\t"
187 "extp %[step2_13], $ac3, 31 \n\t"
188 "add %[step2_11], %[result1], %[result2] \n\t"
189 "add %[step2_12], %[result4], %[result3] \n\t"
190
191 : [load1] "=&r" (load1), [load2] "=&r" (load2),
192 [load3] "=&r" (load3), [load4] "=&r" (load4),
193 [result1] "=&r" (result1), [result2] "=&r" (result2),
194 [result3] "=&r" (result3), [result4] "=&r" (result4),
195 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
196 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
197 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
198 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
199 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
200 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
201 );
202
/*
 * Loads input[2], input[14], input[10], input[6] and produces
 * step1_4..step1_7. step1_5 and step1_6 are the cospi_16_64-scaled
 * combinations built in load5/load6; step1_4 and step1_7 are plain sums.
 */
203 __asm__ __volatile__ (
204 "lh %[load5], 4(%[input]) \n\t"
205 "lh %[load6], 28(%[input]) \n\t"
206 "lh %[load7], 20(%[input]) \n\t"
207 "lh %[load8], 12(%[input]) \n\t"
208
209 "mtlo %[const_2_power_13], $ac1 \n\t"
210 "mthi $zero, $ac1 \n\t"
211 "mtlo %[const_2_power_13], $ac3 \n\t"
212 "mthi $zero, $ac3 \n\t"
213
214 "madd $ac1, %[load5], %[cospi_28_64] \n\t"
215 "msub $ac1, %[load6], %[cospi_4_64] \n\t"
216 "extp %[result1], $ac1, 31 \n\t"
217
218 "madd $ac3, %[load7], %[cospi_12_64] \n\t"
219 "msub $ac3, %[load8], %[cospi_20_64] \n\t"
220 "extp %[result2], $ac3, 31 \n\t"
221
222 "mtlo %[const_2_power_13], $ac1 \n\t"
223 "mthi $zero, $ac1 \n\t"
224 "mtlo %[const_2_power_13], $ac2 \n\t"
225 "mthi $zero, $ac2 \n\t"
226
227 "madd $ac1, %[load7], %[cospi_20_64] \n\t"
228 "madd $ac1, %[load8], %[cospi_12_64] \n\t"
229 "extp %[result3], $ac1, 31 \n\t"
230
231 "madd $ac2, %[load5], %[cospi_4_64] \n\t"
232 "madd $ac2, %[load6], %[cospi_28_64] \n\t"
233 "extp %[result4], $ac2, 31 \n\t"
234
235 "mtlo %[const_2_power_13], $ac1 \n\t"
236 "mthi $zero, $ac1 \n\t"
237 "mtlo %[const_2_power_13], $ac3 \n\t"
238 "mthi $zero, $ac3 \n\t"
239
240 "sub %[load5], %[result4], %[result3] \n\t"
241 "sub %[load5], %[load5], %[result1] \n\t"
242 "add %[load5], %[load5], %[result2] \n\t"
243
244 "sub %[load6], %[result1], %[result2] \n\t"
245 "sub %[load6], %[load6], %[result3] \n\t"
246 "add %[load6], %[load6], %[result4] \n\t"
247
248 "madd $ac1, %[load5], %[cospi_16_64] \n\t"
249 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
250
251 "extp %[step1_5], $ac1, 31 \n\t"
252 "extp %[step1_6], $ac3, 31 \n\t"
253 "add %[step1_4], %[result1], %[result2] \n\t"
254 "add %[step1_7], %[result4], %[result3] \n\t"
255
256 : [load5] "=&r" (load5), [load6] "=&r" (load6),
257 [load7] "=&r" (load7), [load8] "=&r" (load8),
258 [result1] "=&r" (result1), [result2] "=&r" (result2),
259 [result3] "=&r" (result3), [result4] "=&r" (result4),
260 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
261 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
262 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
263 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
264 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
265 [cospi_16_64] "r" (cospi_16_64)
266 );
267
/*
 * No loads here: combines step2_9/10/13/14 and step2_8/11/12/15 into four
 * sums, scales each by cospi_16_64 in its own accumulator ($ac0..$ac3), and
 * extracts step1_10, step1_13, step1_11 and step1_12.
 */
268 __asm__ __volatile__ (
269 "mtlo %[const_2_power_13], $ac0 \n\t"
270 "mthi $zero, $ac0 \n\t"
271 "mtlo %[const_2_power_13], $ac1 \n\t"
272 "mthi $zero, $ac1 \n\t"
273
274 "sub %[load5], %[step2_14], %[step2_13] \n\t"
275 "sub %[load5], %[load5], %[step2_9] \n\t"
276 "add %[load5], %[load5], %[step2_10] \n\t"
277
278 "madd $ac0, %[load5], %[cospi_16_64] \n\t"
279
280 "sub %[load6], %[step2_14], %[step2_13] \n\t"
281 "sub %[load6], %[load6], %[step2_10] \n\t"
282 "add %[load6], %[load6], %[step2_9] \n\t"
283
284 "madd $ac1, %[load6], %[cospi_16_64] \n\t"
285
286 "mtlo %[const_2_power_13], $ac2 \n\t"
287 "mthi $zero, $ac2 \n\t"
288 "mtlo %[const_2_power_13], $ac3 \n\t"
289 "mthi $zero, $ac3 \n\t"
290
291 "sub %[load5], %[step2_15], %[step2_12] \n\t"
292 "sub %[load5], %[load5], %[step2_8] \n\t"
293 "add %[load5], %[load5], %[step2_11] \n\t"
294
295 "madd $ac2, %[load5], %[cospi_16_64] \n\t"
296
297 "sub %[load6], %[step2_15], %[step2_12] \n\t"
298 "sub %[load6], %[load6], %[step2_11] \n\t"
299 "add %[load6], %[load6], %[step2_8] \n\t"
300
301 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
302
303 "extp %[step1_10], $ac0, 31 \n\t"
304 "extp %[step1_13], $ac1, 31 \n\t"
305 "extp %[step1_11], $ac2, 31 \n\t"
306 "extp %[step1_12], $ac3, 31 \n\t"
307
308 : [load5] "=&r" (load5), [load6] "=&r" (load6),
309 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
310 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
311 : [const_2_power_13] "r" (const_2_power_13),
312 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
313 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
314 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
315 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
316 [cospi_16_64] "r" (cospi_16_64)
317 );
318
/*
 * Final add/sub for results 0, 1, 6, 7, 8, 9, 14 and 15, stored with sh at
 * byte offsets 0, 32, 192, 224, 256, 288, 448 and 480 from output (result k
 * goes to byte offset 32 * k). The sums are stored as computed, with no
 * further shift.
 */
319 __asm__ __volatile__ (
320 "add %[load5], %[step1_0], %[step1_7] \n\t"
321 "add %[load5], %[load5], %[step2_12] \n\t"
322 "add %[load5], %[load5], %[step2_15] \n\t"
323 "add %[load6], %[step1_1], %[step1_6] \n\t"
324 "add %[load6], %[load6], %[step2_13] \n\t"
325 "add %[load6], %[load6], %[step2_14] \n\t"
326 "sh %[load5], 0(%[output]) \n\t"
327 "sh %[load6], 32(%[output]) \n\t"
328 "sub %[load5], %[step1_1], %[step1_6] \n\t"
329 "add %[load5], %[load5], %[step2_9] \n\t"
330 "add %[load5], %[load5], %[step2_10] \n\t"
331 "sub %[load6], %[step1_0], %[step1_7] \n\t"
332 "add %[load6], %[load6], %[step2_8] \n\t"
333 "add %[load6], %[load6], %[step2_11] \n\t"
334 "sh %[load5], 192(%[output]) \n\t"
335 "sh %[load6], 224(%[output]) \n\t"
336 "sub %[load5], %[step1_0], %[step1_7] \n\t"
337 "sub %[load5], %[load5], %[step2_8] \n\t"
338 "sub %[load5], %[load5], %[step2_11] \n\t"
339 "sub %[load6], %[step1_1], %[step1_6] \n\t"
340 "sub %[load6], %[load6], %[step2_9] \n\t"
341 "sub %[load6], %[load6], %[step2_10] \n\t"
342 "sh %[load5], 256(%[output]) \n\t"
343 "sh %[load6], 288(%[output]) \n\t"
344 "add %[load5], %[step1_1], %[step1_6] \n\t"
345 "sub %[load5], %[load5], %[step2_13] \n\t"
346 "sub %[load5], %[load5], %[step2_14] \n\t"
347 "add %[load6], %[step1_0], %[step1_7] \n\t"
348 "sub %[load6], %[load6], %[step2_12] \n\t"
349 "sub %[load6], %[load6], %[step2_15] \n\t"
350 "sh %[load5], 448(%[output]) \n\t"
351 "sh %[load6], 480(%[output]) \n\t"
352
353 : [load5] "=&r" (load5), [load6] "=&r" (load6)
354 : [output] "r" (output),
355 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
356 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
357 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
358 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
359 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
360 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
361 );
362
/*
 * Final add/sub for results 2, 3, 4, 5, 10, 11, 12 and 13, stored at byte
 * offsets 64, 96, 128, 160, 320, 352, 384 and 416 from output.
 */
363 __asm__ __volatile__ (
364 "add %[load5], %[step1_2], %[step1_5] \n\t"
365 "add %[load5], %[load5], %[step1_13] \n\t"
366 "add %[load6], %[step1_3], %[step1_4] \n\t"
367 "add %[load6], %[load6], %[step1_12] \n\t"
368 "sh %[load5], 64(%[output]) \n\t"
369 "sh %[load6], 96(%[output]) \n\t"
370 "sub %[load5], %[step1_3], %[step1_4] \n\t"
371 "add %[load5], %[load5], %[step1_11] \n\t"
372 "sub %[load6], %[step1_2], %[step1_5] \n\t"
373 "add %[load6], %[load6], %[step1_10] \n\t"
374 "sh %[load5], 128(%[output]) \n\t"
375 "sh %[load6], 160(%[output]) \n\t"
376 "sub %[load5], %[step1_2], %[step1_5] \n\t"
377 "sub %[load5], %[load5], %[step1_10] \n\t"
378 "sub %[load6], %[step1_3], %[step1_4] \n\t"
379 "sub %[load6], %[load6], %[step1_11] \n\t"
380 "sh %[load5], 320(%[output]) \n\t"
381 "sh %[load6], 352(%[output]) \n\t"
382 "add %[load5], %[step1_3], %[step1_4] \n\t"
383 "sub %[load5], %[load5], %[step1_12] \n\t"
384 "add %[load6], %[step1_2], %[step1_5] \n\t"
385 "sub %[load6], %[load6], %[step1_13] \n\t"
386 "sh %[load5], 384(%[output]) \n\t"
387 "sh %[load6], 416(%[output]) \n\t"
388
389 : [load5] "=&r" (load5), [load6] "=&r" (load6)
390 : [output] "r" (output),
391 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
392 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
393 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
394 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
395 );
396
/* Next source row; next destination column (results are stored transposed). */
397 input += 16;
398 output += 1;
399 }
400 }
401
/*
 * Column pass of the 16x16 inverse DCT that adds the result into dest,
 * written with MIPS DSP inline assembly.
 *
 * input       - 16 groups of 16 int16_t values; each of the 16 loop
 *               iterations reads one group (halfword loads at byte offsets
 *               0..30) and then advances input by 16 elements.
 * dest        - pixel buffer that is read and updated in place; iteration i
 *               starts at dest + i and touches 16 bytes, stepping by
 *               dest_stride between them.
 * dest_stride - byte distance between consecutive updated pixels.
 *
 * For every pixel the final block computes cm[pixel + ((sum + 32) >> 6)]
 * where cm is vpx_ff_cropTbl and sum is the transform output for that
 * position, and writes the looked-up byte back.
 * NOTE(review): vpx_ff_cropTbl is defined elsewhere; presumably it is a
 * clamp-to-byte table that tolerates out-of-range indices -- confirm.
 * NOTE(review): extp depends on the DSPControl pos field; in this file it is
 * written (value 45) by vpx_idct16x16_256_add_dspr2 before this function is
 * called.
 */
idct16_cols_add_blk_dspr2(int16_t * input,uint8_t * dest,int dest_stride)402 void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
403 int dest_stride) {
404 int i;
405 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
406 int step1_8, step1_9, step1_10, step1_11;
407 int step1_12, step1_13, step1_14, step1_15;
408 int step2_0, step2_1, step2_2, step2_3;
409 int step2_8, step2_9, step2_10, step2_11;
410 int step2_12, step2_13, step2_14, step2_15;
411 int load1, load2, load3, load4, load5, load6, load7, load8;
412 int result1, result2, result3, result4;
413 const int const_2_power_13 = 8192;
414 uint8_t *dest_pix;
415 uint8_t *cm = vpx_ff_cropTbl;
416
417 /* prefetch vpx_ff_cropTbl */
418 prefetch_load(vpx_ff_cropTbl);
419 prefetch_load(vpx_ff_cropTbl + 32);
420 prefetch_load(vpx_ff_cropTbl + 64);
421 prefetch_load(vpx_ff_cropTbl + 96);
422 prefetch_load(vpx_ff_cropTbl + 128);
423 prefetch_load(vpx_ff_cropTbl + 160);
424 prefetch_load(vpx_ff_cropTbl + 192);
425 prefetch_load(vpx_ff_cropTbl + 224);
426
427 for (i = 0; i < 16; ++i) {
/* Iteration i writes into the destination column that starts at dest + i. */
428 dest_pix = (dest + i);
/*
 * Loads input[0], input[8], input[4], input[12] and produces
 * step1_0..step1_3. Each accumulator is preloaded with const_2_power_13
 * before the multiply-accumulate and read back with extp.
 */
429 __asm__ __volatile__ (
430 "lh %[load1], 0(%[input]) \n\t"
431 "lh %[load2], 16(%[input]) \n\t"
432 "lh %[load3], 8(%[input]) \n\t"
433 "lh %[load4], 24(%[input]) \n\t"
434
435 "mtlo %[const_2_power_13], $ac1 \n\t"
436 "mthi $zero, $ac1 \n\t"
437 "mtlo %[const_2_power_13], $ac2 \n\t"
438 "mthi $zero, $ac2 \n\t"
439 "add %[result1], %[load1], %[load2] \n\t"
440 "sub %[result2], %[load1], %[load2] \n\t"
441 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
442 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
443 "extp %[step2_0], $ac1, 31 \n\t"
444 "extp %[step2_1], $ac2, 31 \n\t"
445
446 "mtlo %[const_2_power_13], $ac3 \n\t"
447 "mthi $zero, $ac3 \n\t"
448 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
449 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
450 "extp %[step2_2], $ac3, 31 \n\t"
451
452 "mtlo %[const_2_power_13], $ac1 \n\t"
453 "mthi $zero, $ac1 \n\t"
454 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
455 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
456 "extp %[step2_3], $ac1, 31 \n\t"
457
458 "add %[step1_0], %[step2_0], %[step2_3] \n\t"
459 "add %[step1_1], %[step2_1], %[step2_2] \n\t"
460 "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
461 "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
462
463 : [load1] "=&r" (load1), [load2] "=&r" (load2),
464 [load3] "=&r" (load3), [load4] "=&r" (load4),
465 [result1] "=&r" (result1), [result2] "=&r" (result2),
466 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
467 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
468 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
469 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
470 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
471 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
472 [cospi_16_64] "r" (cospi_16_64)
473 );
474
/*
 * Loads input[1], input[15], input[9], input[7] and produces step2_8,
 * step2_9, step2_14 and step2_15.
 */
475 __asm__ __volatile__ (
476 "lh %[load5], 2(%[input]) \n\t"
477 "lh %[load6], 30(%[input]) \n\t"
478 "lh %[load7], 18(%[input]) \n\t"
479 "lh %[load8], 14(%[input]) \n\t"
480
481 "mtlo %[const_2_power_13], $ac1 \n\t"
482 "mthi $zero, $ac1 \n\t"
483 "mtlo %[const_2_power_13], $ac3 \n\t"
484 "mthi $zero, $ac3 \n\t"
485
486 "madd $ac1, %[load5], %[cospi_30_64] \n\t"
487 "msub $ac1, %[load6], %[cospi_2_64] \n\t"
488 "extp %[result1], $ac1, 31 \n\t"
489
490 "madd $ac3, %[load7], %[cospi_14_64] \n\t"
491 "msub $ac3, %[load8], %[cospi_18_64] \n\t"
492 "extp %[result2], $ac3, 31 \n\t"
493
494 "mtlo %[const_2_power_13], $ac1 \n\t"
495 "mthi $zero, $ac1 \n\t"
496 "mtlo %[const_2_power_13], $ac2 \n\t"
497 "mthi $zero, $ac2 \n\t"
498
499 "madd $ac1, %[load7], %[cospi_18_64] \n\t"
500 "madd $ac1, %[load8], %[cospi_14_64] \n\t"
501 "extp %[result3], $ac1, 31 \n\t"
502
503 "madd $ac2, %[load5], %[cospi_2_64] \n\t"
504 "madd $ac2, %[load6], %[cospi_30_64] \n\t"
505 "extp %[result4], $ac2, 31 \n\t"
506
507 "sub %[load5], %[result1], %[result2] \n\t"
508 "sub %[load6], %[result4], %[result3] \n\t"
509
510 "mtlo %[const_2_power_13], $ac1 \n\t"
511 "mthi $zero, $ac1 \n\t"
512 "mtlo %[const_2_power_13], $ac3 \n\t"
513 "mthi $zero, $ac3 \n\t"
514
515 "madd $ac1, %[load6], %[cospi_24_64] \n\t"
516 "msub $ac1, %[load5], %[cospi_8_64] \n\t"
517 "madd $ac3, %[load5], %[cospi_24_64] \n\t"
518 "madd $ac3, %[load6], %[cospi_8_64] \n\t"
519
520 "extp %[step2_9], $ac1, 31 \n\t"
521 "extp %[step2_14], $ac3, 31 \n\t"
522 "add %[step2_8], %[result1], %[result2] \n\t"
523 "add %[step2_15], %[result4], %[result3] \n\t"
524
525 : [load5] "=&r" (load5), [load6] "=&r" (load6),
526 [load7] "=&r" (load7), [load8] "=&r" (load8),
527 [result1] "=&r" (result1), [result2] "=&r" (result2),
528 [result3] "=&r" (result3), [result4] "=&r" (result4),
529 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
530 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
531 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
532 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
533 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
534 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
535 );
536
/*
 * Loads input[5], input[11], input[13], input[3] and produces
 * step2_10..step2_13.
 */
537 __asm__ __volatile__ (
538 "lh %[load1], 10(%[input]) \n\t"
539 "lh %[load2], 22(%[input]) \n\t"
540 "lh %[load3], 26(%[input]) \n\t"
541 "lh %[load4], 6(%[input]) \n\t"
542
543 "mtlo %[const_2_power_13], $ac1 \n\t"
544 "mthi $zero, $ac1 \n\t"
545 "mtlo %[const_2_power_13], $ac3 \n\t"
546 "mthi $zero, $ac3 \n\t"
547
548 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
549 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
550 "extp %[result1], $ac1, 31 \n\t"
551
552 "madd $ac3, %[load3], %[cospi_6_64] \n\t"
553 "msub $ac3, %[load4], %[cospi_26_64] \n\t"
554 "extp %[result2], $ac3, 31 \n\t"
555
556 "mtlo %[const_2_power_13], $ac1 \n\t"
557 "mthi $zero, $ac1 \n\t"
558 "mtlo %[const_2_power_13], $ac2 \n\t"
559 "mthi $zero, $ac2 \n\t"
560
561 "madd $ac1, %[load1], %[cospi_10_64] \n\t"
562 "madd $ac1, %[load2], %[cospi_22_64] \n\t"
563 "extp %[result3], $ac1, 31 \n\t"
564
565 "madd $ac2, %[load3], %[cospi_26_64] \n\t"
566 "madd $ac2, %[load4], %[cospi_6_64] \n\t"
567 "extp %[result4], $ac2, 31 \n\t"
568
569 "mtlo %[const_2_power_13], $ac1 \n\t"
570 "mthi $zero, $ac1 \n\t"
571 "mtlo %[const_2_power_13], $ac3 \n\t"
572 "mthi $zero, $ac3 \n\t"
573
574 "sub %[load1], %[result2], %[result1] \n\t"
575 "sub %[load2], %[result4], %[result3] \n\t"
576
577 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
578 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
579 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
580 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
581
582 "extp %[step2_10], $ac1, 31 \n\t"
583 "extp %[step2_13], $ac3, 31 \n\t"
584 "add %[step2_11], %[result1], %[result2] \n\t"
585 "add %[step2_12], %[result4], %[result3] \n\t"
586
587 : [load1] "=&r" (load1), [load2] "=&r" (load2),
588 [load3] "=&r" (load3), [load4] "=&r" (load4),
589 [result1] "=&r" (result1), [result2] "=&r" (result2),
590 [result3] "=&r" (result3), [result4] "=&r" (result4),
591 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
592 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
593 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
594 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
595 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
596 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
597 );
598
/*
 * Loads input[2], input[14], input[10], input[6] and produces
 * step1_4..step1_7.
 */
599 __asm__ __volatile__ (
600 "lh %[load5], 4(%[input]) \n\t"
601 "lh %[load6], 28(%[input]) \n\t"
602 "lh %[load7], 20(%[input]) \n\t"
603 "lh %[load8], 12(%[input]) \n\t"
604
605 "mtlo %[const_2_power_13], $ac1 \n\t"
606 "mthi $zero, $ac1 \n\t"
607 "mtlo %[const_2_power_13], $ac3 \n\t"
608 "mthi $zero, $ac3 \n\t"
609
610 "madd $ac1, %[load5], %[cospi_28_64] \n\t"
611 "msub $ac1, %[load6], %[cospi_4_64] \n\t"
612 "extp %[result1], $ac1, 31 \n\t"
613
614 "madd $ac3, %[load7], %[cospi_12_64] \n\t"
615 "msub $ac3, %[load8], %[cospi_20_64] \n\t"
616 "extp %[result2], $ac3, 31 \n\t"
617
618 "mtlo %[const_2_power_13], $ac1 \n\t"
619 "mthi $zero, $ac1 \n\t"
620 "mtlo %[const_2_power_13], $ac2 \n\t"
621 "mthi $zero, $ac2 \n\t"
622
623 "madd $ac1, %[load7], %[cospi_20_64] \n\t"
624 "madd $ac1, %[load8], %[cospi_12_64] \n\t"
625 "extp %[result3], $ac1, 31 \n\t"
626
627 "madd $ac2, %[load5], %[cospi_4_64] \n\t"
628 "madd $ac2, %[load6], %[cospi_28_64] \n\t"
629 "extp %[result4], $ac2, 31 \n\t"
630
631 "mtlo %[const_2_power_13], $ac1 \n\t"
632 "mthi $zero, $ac1 \n\t"
633 "mtlo %[const_2_power_13], $ac3 \n\t"
634 "mthi $zero, $ac3 \n\t"
635
636 "sub %[load5], %[result4], %[result3] \n\t"
637 "sub %[load5], %[load5], %[result1] \n\t"
638 "add %[load5], %[load5], %[result2] \n\t"
639
640 "sub %[load6], %[result1], %[result2] \n\t"
641 "sub %[load6], %[load6], %[result3] \n\t"
642 "add %[load6], %[load6], %[result4] \n\t"
643
644 "madd $ac1, %[load5], %[cospi_16_64] \n\t"
645 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
646
647 "extp %[step1_5], $ac1, 31 \n\t"
648 "extp %[step1_6], $ac3, 31 \n\t"
649
650 "add %[step1_4], %[result1], %[result2] \n\t"
651 "add %[step1_7], %[result4], %[result3] \n\t"
652
653 : [load5] "=&r" (load5), [load6] "=&r" (load6),
654 [load7] "=&r" (load7), [load8] "=&r" (load8),
655 [result1] "=&r" (result1), [result2] "=&r" (result2),
656 [result3] "=&r" (result3), [result4] "=&r" (result4),
657 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
658 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
659 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
660 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
661 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
662 [cospi_16_64] "r" (cospi_16_64)
663 );
664
/*
 * No loads here: combines the step2_8..step2_15 values into four sums,
 * scales each by cospi_16_64 in its own accumulator ($ac0..$ac3), and
 * extracts step1_10, step1_13, step1_11 and step1_12.
 */
665 __asm__ __volatile__ (
666 "mtlo %[const_2_power_13], $ac0 \n\t"
667 "mthi $zero, $ac0 \n\t"
668 "mtlo %[const_2_power_13], $ac1 \n\t"
669 "mthi $zero, $ac1 \n\t"
670
671 "sub %[load5], %[step2_14], %[step2_13] \n\t"
672 "sub %[load5], %[load5], %[step2_9] \n\t"
673 "add %[load5], %[load5], %[step2_10] \n\t"
674
675 "madd $ac0, %[load5], %[cospi_16_64] \n\t"
676
677 "sub %[load6], %[step2_14], %[step2_13] \n\t"
678 "sub %[load6], %[load6], %[step2_10] \n\t"
679 "add %[load6], %[load6], %[step2_9] \n\t"
680
681 "madd $ac1, %[load6], %[cospi_16_64] \n\t"
682
683 "mtlo %[const_2_power_13], $ac2 \n\t"
684 "mthi $zero, $ac2 \n\t"
685 "mtlo %[const_2_power_13], $ac3 \n\t"
686 "mthi $zero, $ac3 \n\t"
687
688 "sub %[load5], %[step2_15], %[step2_12] \n\t"
689 "sub %[load5], %[load5], %[step2_8] \n\t"
690 "add %[load5], %[load5], %[step2_11] \n\t"
691
692 "madd $ac2, %[load5], %[cospi_16_64] \n\t"
693
694 "sub %[load6], %[step2_15], %[step2_12] \n\t"
695 "sub %[load6], %[load6], %[step2_11] \n\t"
696 "add %[load6], %[load6], %[step2_8] \n\t"
697
698 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
699
700 "extp %[step1_10], $ac0, 31 \n\t"
701 "extp %[step1_13], $ac1, 31 \n\t"
702 "extp %[step1_11], $ac2, 31 \n\t"
703 "extp %[step1_12], $ac3, 31 \n\t"
704
705 : [load5] "=&r" (load5), [load6] "=&r" (load6),
706 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
707 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
708 : [const_2_power_13] "r" (const_2_power_13),
709 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
710 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
711 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
712 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
713 [cospi_16_64] "r" (cospi_16_64)
714 );
715
/* Remaining pairwise sums, done in C so the final block gets them as inputs. */
716 step1_8 = step2_8 + step2_11;
717 step1_9 = step2_9 + step2_10;
718 step1_14 = step2_13 + step2_14;
719 step1_15 = step2_12 + step2_15;
720
/*
 * Reconstruction of one destination column, two pixels per group. For each
 * pixel: lbu loads the current byte, the transform output is formed with
 * add/sub, 32 is added and the sum is shifted right by 6 (sra), the pixel is
 * added, and lbux uses that value as an index into cm; sb writes the byte
 * back. dest_pix advances by dest_stride after every store except the last.
 *
 * The first eight pixels add step1_15, step1_14, step1_13, step1_12,
 * step1_11, step1_10, step1_9, step1_8 (in that order) to the matching
 * step1_0..step1_7 combination; the last eight subtract the same values in
 * the reverse order.
 */
721 __asm__ __volatile__ (
722 "lbu %[load7], 0(%[dest_pix]) \n\t"
723 "add %[load5], %[step1_0], %[step1_7] \n\t"
724 "add %[load5], %[load5], %[step1_15] \n\t"
725 "addi %[load5], %[load5], 32 \n\t"
726 "sra %[load5], %[load5], 6 \n\t"
727 "add %[load7], %[load7], %[load5] \n\t"
728 "lbux %[load5], %[load7](%[cm]) \n\t"
729 "add %[load6], %[step1_1], %[step1_6] \n\t"
730 "add %[load6], %[load6], %[step1_14] \n\t"
731 "sb %[load5], 0(%[dest_pix]) \n\t"
732 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
733 "lbu %[load8], 0(%[dest_pix]) \n\t"
734 "addi %[load6], %[load6], 32 \n\t"
735 "sra %[load6], %[load6], 6 \n\t"
736 "add %[load8], %[load8], %[load6] \n\t"
737 "lbux %[load6], %[load8](%[cm]) \n\t"
738 "sb %[load6], 0(%[dest_pix]) \n\t"
739 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
740
741 "lbu %[load7], 0(%[dest_pix]) \n\t"
742 "add %[load5], %[step1_2], %[step1_5] \n\t"
743 "add %[load5], %[load5], %[step1_13] \n\t"
744 "addi %[load5], %[load5], 32 \n\t"
745 "sra %[load5], %[load5], 6 \n\t"
746 "add %[load7], %[load7], %[load5] \n\t"
747 "lbux %[load5], %[load7](%[cm]) \n\t"
748 "add %[load6], %[step1_3], %[step1_4] \n\t"
749 "add %[load6], %[load6], %[step1_12] \n\t"
750 "sb %[load5], 0(%[dest_pix]) \n\t"
751 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
752 "lbu %[load8], 0(%[dest_pix]) \n\t"
753 "addi %[load6], %[load6], 32 \n\t"
754 "sra %[load6], %[load6], 6 \n\t"
755 "add %[load8], %[load8], %[load6] \n\t"
756 "lbux %[load6], %[load8](%[cm]) \n\t"
757 "sb %[load6], 0(%[dest_pix]) \n\t"
758 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
759
760 "lbu %[load7], 0(%[dest_pix]) \n\t"
761 "sub %[load5], %[step1_3], %[step1_4] \n\t"
762 "add %[load5], %[load5], %[step1_11] \n\t"
763 "addi %[load5], %[load5], 32 \n\t"
764 "sra %[load5], %[load5], 6 \n\t"
765 "add %[load7], %[load7], %[load5] \n\t"
766 "lbux %[load5], %[load7](%[cm]) \n\t"
767 "sub %[load6], %[step1_2], %[step1_5] \n\t"
768 "add %[load6], %[load6], %[step1_10] \n\t"
769 "sb %[load5], 0(%[dest_pix]) \n\t"
770 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
771 "lbu %[load8], 0(%[dest_pix]) \n\t"
772 "addi %[load6], %[load6], 32 \n\t"
773 "sra %[load6], %[load6], 6 \n\t"
774 "add %[load8], %[load8], %[load6] \n\t"
775 "lbux %[load6], %[load8](%[cm]) \n\t"
776 "sb %[load6], 0(%[dest_pix]) \n\t"
777 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
778
779 "sub %[load5], %[step1_1], %[step1_6] \n\t"
780 "lbu %[load7], 0(%[dest_pix]) \n\t"
781 "add %[load5], %[load5], %[step1_9] \n\t"
782 "addi %[load5], %[load5], 32 \n\t"
783 "sra %[load5], %[load5], 6 \n\t"
784 "add %[load7], %[load7], %[load5] \n\t"
785 "lbux %[load5], %[load7](%[cm]) \n\t"
786 "sub %[load6], %[step1_0], %[step1_7] \n\t"
787 "add %[load6], %[load6], %[step1_8] \n\t"
788 "sb %[load5], 0(%[dest_pix]) \n\t"
789 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
790 "lbu %[load8], 0(%[dest_pix]) \n\t"
791 "addi %[load6], %[load6], 32 \n\t"
792 "sra %[load6], %[load6], 6 \n\t"
793 "add %[load8], %[load8], %[load6] \n\t"
794 "lbux %[load6], %[load8](%[cm]) \n\t"
795 "sb %[load6], 0(%[dest_pix]) \n\t"
796 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
797
798 "lbu %[load7], 0(%[dest_pix]) \n\t"
799 "sub %[load5], %[step1_0], %[step1_7] \n\t"
800 "sub %[load5], %[load5], %[step1_8] \n\t"
801 "addi %[load5], %[load5], 32 \n\t"
802 "sra %[load5], %[load5], 6 \n\t"
803 "add %[load7], %[load7], %[load5] \n\t"
804 "lbux %[load5], %[load7](%[cm]) \n\t"
805 "sub %[load6], %[step1_1], %[step1_6] \n\t"
806 "sub %[load6], %[load6], %[step1_9] \n\t"
807 "sb %[load5], 0(%[dest_pix]) \n\t"
808 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
809 "lbu %[load8], 0(%[dest_pix]) \n\t"
810 "addi %[load6], %[load6], 32 \n\t"
811 "sra %[load6], %[load6], 6 \n\t"
812 "add %[load8], %[load8], %[load6] \n\t"
813 "lbux %[load6], %[load8](%[cm]) \n\t"
814 "sb %[load6], 0(%[dest_pix]) \n\t"
815 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
816
817 "lbu %[load7], 0(%[dest_pix]) \n\t"
818 "sub %[load5], %[step1_2], %[step1_5] \n\t"
819 "sub %[load5], %[load5], %[step1_10] \n\t"
820 "addi %[load5], %[load5], 32 \n\t"
821 "sra %[load5], %[load5], 6 \n\t"
822 "add %[load7], %[load7], %[load5] \n\t"
823 "lbux %[load5], %[load7](%[cm]) \n\t"
824 "sub %[load6], %[step1_3], %[step1_4] \n\t"
825 "sub %[load6], %[load6], %[step1_11] \n\t"
826 "sb %[load5], 0(%[dest_pix]) \n\t"
827 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
828 "lbu %[load8], 0(%[dest_pix]) \n\t"
829 "addi %[load6], %[load6], 32 \n\t"
830 "sra %[load6], %[load6], 6 \n\t"
831 "add %[load8], %[load8], %[load6] \n\t"
832 "lbux %[load6], %[load8](%[cm]) \n\t"
833 "sb %[load6], 0(%[dest_pix]) \n\t"
834 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
835
836 "lbu %[load7], 0(%[dest_pix]) \n\t"
837 "add %[load5], %[step1_3], %[step1_4] \n\t"
838 "sub %[load5], %[load5], %[step1_12] \n\t"
839 "addi %[load5], %[load5], 32 \n\t"
840 "sra %[load5], %[load5], 6 \n\t"
841 "add %[load7], %[load7], %[load5] \n\t"
842 "lbux %[load5], %[load7](%[cm]) \n\t"
843 "add %[load6], %[step1_2], %[step1_5] \n\t"
844 "sub %[load6], %[load6], %[step1_13] \n\t"
845 "sb %[load5], 0(%[dest_pix]) \n\t"
846 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
847 "lbu %[load8], 0(%[dest_pix]) \n\t"
848 "addi %[load6], %[load6], 32 \n\t"
849 "sra %[load6], %[load6], 6 \n\t"
850 "add %[load8], %[load8], %[load6] \n\t"
851 "lbux %[load6], %[load8](%[cm]) \n\t"
852 "sb %[load6], 0(%[dest_pix]) \n\t"
853 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
854
855 "lbu %[load7], 0(%[dest_pix]) \n\t"
856 "add %[load5], %[step1_1], %[step1_6] \n\t"
857 "sub %[load5], %[load5], %[step1_14] \n\t"
858 "addi %[load5], %[load5], 32 \n\t"
859 "sra %[load5], %[load5], 6 \n\t"
860 "add %[load7], %[load7], %[load5] \n\t"
861 "lbux %[load5], %[load7](%[cm]) \n\t"
862 "add %[load6], %[step1_0], %[step1_7] \n\t"
863 "sub %[load6], %[load6], %[step1_15] \n\t"
864 "sb %[load5], 0(%[dest_pix]) \n\t"
865 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
866 "lbu %[load8], 0(%[dest_pix]) \n\t"
867 "addi %[load6], %[load6], 32 \n\t"
868 "sra %[load6], %[load6], 6 \n\t"
869 "add %[load8], %[load8], %[load6] \n\t"
870 "lbux %[load6], %[load8](%[cm]) \n\t"
871 "sb %[load6], 0(%[dest_pix]) \n\t"
872
873 : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
874 [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
875 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
876 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
877 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
878 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
879 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
880 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
881 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
882 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
883 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
884 );
885
/* Move on to the next group of 16 input values. */
886 input += 16;
887 }
888 }
889
vpx_idct16x16_256_add_dspr2(const int16_t * input,uint8_t * dest,int dest_stride)890 void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
891 int dest_stride) {
892 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
893 uint32_t pos = 45;
894
895 /* bit positon for extract from acc */
896 __asm__ __volatile__ (
897 "wrdsp %[pos], 1 \n\t"
898 :
899 : [pos] "r" (pos)
900 );
901
902 // First transform rows
903 idct16_rows_dspr2(input, out, 16);
904
905 // Then transform columns and add to dest
906 idct16_cols_add_blk_dspr2(out, dest, dest_stride);
907 }
908
vpx_idct16x16_10_add_dspr2(const int16_t * input,uint8_t * dest,int dest_stride)909 void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
910 int dest_stride) {
911 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
912 int16_t *outptr = out;
913 uint32_t i;
914 uint32_t pos = 45;
915
916 /* bit positon for extract from acc */
917 __asm__ __volatile__ (
918 "wrdsp %[pos], 1 \n\t"
919 :
920 : [pos] "r" (pos)
921 );
922
923 // First transform rows. Since all non-zero dct coefficients are in
924 // upper-left 4x4 area, we only need to calculate first 4 rows here.
925 idct16_rows_dspr2(input, outptr, 4);
926
927 outptr += 4;
928 for (i = 0; i < 6; ++i) {
929 __asm__ __volatile__ (
930 "sw $zero, 0(%[outptr]) \n\t"
931 "sw $zero, 32(%[outptr]) \n\t"
932 "sw $zero, 64(%[outptr]) \n\t"
933 "sw $zero, 96(%[outptr]) \n\t"
934 "sw $zero, 128(%[outptr]) \n\t"
935 "sw $zero, 160(%[outptr]) \n\t"
936 "sw $zero, 192(%[outptr]) \n\t"
937 "sw $zero, 224(%[outptr]) \n\t"
938 "sw $zero, 256(%[outptr]) \n\t"
939 "sw $zero, 288(%[outptr]) \n\t"
940 "sw $zero, 320(%[outptr]) \n\t"
941 "sw $zero, 352(%[outptr]) \n\t"
942 "sw $zero, 384(%[outptr]) \n\t"
943 "sw $zero, 416(%[outptr]) \n\t"
944 "sw $zero, 448(%[outptr]) \n\t"
945 "sw $zero, 480(%[outptr]) \n\t"
946
947 :
948 : [outptr] "r" (outptr)
949 );
950
951 outptr += 2;
952 }
953
954 // Then transform columns
955 idct16_cols_add_blk_dspr2(out, dest, dest_stride);
956 }
957
/*
 * DC-only 16x16 inverse transform + add: derives a single offset from
 * input[0] and adds it, with unsigned-byte saturation, to each of the
 * 16x16 destination pixels.
 *
 * input       - coefficient block; only input[0] is read
 * dest        - destination pixels; each row is accessed with word loads and
 *               stores, so rows must be four-byte aligned
 * dest_stride - byte distance between destination rows
 */
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
      "wrdsp %[pos], 1 \n\t"

      :
      : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

  /* a1 = (out + 32) >> 6, i.e. round and drop six fractional bits. */
  __asm__ __volatile__ (
      "addi %[out], %[out], 32 \n\t"
      "sra %[a1], %[out], 6 \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  /* replv.qb replicates only the low 8 bits of its source, so a magnitude
   * above 255 would wrap (256 would become 0 and leave the block untouched).
   * Clamping to 255 is exact: the saturating byte add of 255 already yields
   * 255 for every pixel and the saturating byte subtract of 255 yields 0.
   * The negation cannot overflow because a1 is the result of a >> 6. */
  absa1 = (a1 < 0) ? -a1 : a1;
  if (absa1 > 255) {
    absa1 = 255;
  }

  /* Broadcast the byte magnitude into all four lanes of a word. */
  __asm__ __volatile__ (
      "replv.qb %[vector_a1], %[absa1] \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [absa1] "r" (absa1)
  );

  if (a1 < 0) {
    /* Negative offset: saturating subtract, four pixels per word,
     * four words (16 pixels) per row. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          /* addu: pointer arithmetic must not trap on signed overflow */
          "addu %[dest], %[dest], %[dest_stride] \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          /* the asm reads and writes the pixels behind dest */
          : "memory"
      );
    }
  } else {
    /* Non-negative offset: saturating add, same row layout as above. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
          "lw %[t1], 0(%[dest]) \n\t"
          "lw %[t2], 4(%[dest]) \n\t"
          "lw %[t3], 8(%[dest]) \n\t"
          "lw %[t4], 12(%[dest]) \n\t"
          "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
          "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
          "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
          "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
          "sw %[vector_1], 0(%[dest]) \n\t"
          "sw %[vector_2], 4(%[dest]) \n\t"
          "sw %[vector_3], 8(%[dest]) \n\t"
          "sw %[vector_4], 12(%[dest]) \n\t"
          /* addu: pointer arithmetic must not trap on signed overflow */
          "addu %[dest], %[dest], %[dest_stride] \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          /* the asm reads and writes the pixels behind dest */
          : "memory"
      );
    }
  }
}
1054
iadst16_dspr2(const int16_t * input,int16_t * output)1055 void iadst16_dspr2(const int16_t *input, int16_t *output) {
1056 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
1057
1058 int x0 = input[15];
1059 int x1 = input[0];
1060 int x2 = input[13];
1061 int x3 = input[2];
1062 int x4 = input[11];
1063 int x5 = input[4];
1064 int x6 = input[9];
1065 int x7 = input[6];
1066 int x8 = input[7];
1067 int x9 = input[8];
1068 int x10 = input[5];
1069 int x11 = input[10];
1070 int x12 = input[3];
1071 int x13 = input[12];
1072 int x14 = input[1];
1073 int x15 = input[14];
1074
1075 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
1076 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
1077 output[0] = output[1] = output[2] = output[3] = output[4]
1078 = output[5] = output[6] = output[7] = output[8]
1079 = output[9] = output[10] = output[11] = output[12]
1080 = output[13] = output[14] = output[15] = 0;
1081 return;
1082 }
1083
1084 // stage 1
1085 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1086 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1087 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1088 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1089 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1090 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1091 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1092 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1093 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1094 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1095 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1096 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1097 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1098 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1099 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1100 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1101
1102 x0 = dct_const_round_shift(s0 + s8);
1103 x1 = dct_const_round_shift(s1 + s9);
1104 x2 = dct_const_round_shift(s2 + s10);
1105 x3 = dct_const_round_shift(s3 + s11);
1106 x4 = dct_const_round_shift(s4 + s12);
1107 x5 = dct_const_round_shift(s5 + s13);
1108 x6 = dct_const_round_shift(s6 + s14);
1109 x7 = dct_const_round_shift(s7 + s15);
1110 x8 = dct_const_round_shift(s0 - s8);
1111 x9 = dct_const_round_shift(s1 - s9);
1112 x10 = dct_const_round_shift(s2 - s10);
1113 x11 = dct_const_round_shift(s3 - s11);
1114 x12 = dct_const_round_shift(s4 - s12);
1115 x13 = dct_const_round_shift(s5 - s13);
1116 x14 = dct_const_round_shift(s6 - s14);
1117 x15 = dct_const_round_shift(s7 - s15);
1118
1119 // stage 2
1120 s0 = x0;
1121 s1 = x1;
1122 s2 = x2;
1123 s3 = x3;
1124 s4 = x4;
1125 s5 = x5;
1126 s6 = x6;
1127 s7 = x7;
1128 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1129 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1130 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1131 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1132 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
1133 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1134 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
1135 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1136
1137 x0 = s0 + s4;
1138 x1 = s1 + s5;
1139 x2 = s2 + s6;
1140 x3 = s3 + s7;
1141 x4 = s0 - s4;
1142 x5 = s1 - s5;
1143 x6 = s2 - s6;
1144 x7 = s3 - s7;
1145 x8 = dct_const_round_shift(s8 + s12);
1146 x9 = dct_const_round_shift(s9 + s13);
1147 x10 = dct_const_round_shift(s10 + s14);
1148 x11 = dct_const_round_shift(s11 + s15);
1149 x12 = dct_const_round_shift(s8 - s12);
1150 x13 = dct_const_round_shift(s9 - s13);
1151 x14 = dct_const_round_shift(s10 - s14);
1152 x15 = dct_const_round_shift(s11 - s15);
1153
1154 // stage 3
1155 s0 = x0;
1156 s1 = x1;
1157 s2 = x2;
1158 s3 = x3;
1159 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1160 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1161 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
1162 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1163 s8 = x8;
1164 s9 = x9;
1165 s10 = x10;
1166 s11 = x11;
1167 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1168 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1169 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
1170 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1171
1172 x0 = s0 + s2;
1173 x1 = s1 + s3;
1174 x2 = s0 - s2;
1175 x3 = s1 - s3;
1176 x4 = dct_const_round_shift(s4 + s6);
1177 x5 = dct_const_round_shift(s5 + s7);
1178 x6 = dct_const_round_shift(s4 - s6);
1179 x7 = dct_const_round_shift(s5 - s7);
1180 x8 = s8 + s10;
1181 x9 = s9 + s11;
1182 x10 = s8 - s10;
1183 x11 = s9 - s11;
1184 x12 = dct_const_round_shift(s12 + s14);
1185 x13 = dct_const_round_shift(s13 + s15);
1186 x14 = dct_const_round_shift(s12 - s14);
1187 x15 = dct_const_round_shift(s13 - s15);
1188
1189 // stage 4
1190 s2 = (- cospi_16_64) * (x2 + x3);
1191 s3 = cospi_16_64 * (x2 - x3);
1192 s6 = cospi_16_64 * (x6 + x7);
1193 s7 = cospi_16_64 * (- x6 + x7);
1194 s10 = cospi_16_64 * (x10 + x11);
1195 s11 = cospi_16_64 * (- x10 + x11);
1196 s14 = (- cospi_16_64) * (x14 + x15);
1197 s15 = cospi_16_64 * (x14 - x15);
1198
1199 x2 = dct_const_round_shift(s2);
1200 x3 = dct_const_round_shift(s3);
1201 x6 = dct_const_round_shift(s6);
1202 x7 = dct_const_round_shift(s7);
1203 x10 = dct_const_round_shift(s10);
1204 x11 = dct_const_round_shift(s11);
1205 x14 = dct_const_round_shift(s14);
1206 x15 = dct_const_round_shift(s15);
1207
1208 output[0] = x0;
1209 output[1] = -x8;
1210 output[2] = x12;
1211 output[3] = -x4;
1212 output[4] = x6;
1213 output[5] = x14;
1214 output[6] = x10;
1215 output[7] = x2;
1216 output[8] = x3;
1217 output[9] = x11;
1218 output[10] = x15;
1219 output[11] = x7;
1220 output[12] = x5;
1221 output[13] = -x13;
1222 output[14] = x9;
1223 output[15] = -x1;
1224 }
1225
1226
1227 #endif // HAVE_DSPR2
1228