1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_filter.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21
22 #if HAVE_DSPR2
/* Backing storage for the pixel-clamp lookup table: CROP_WIDTH guard bytes of
 * 0 below, the identity mapping 0..255 in the middle, and CROP_WIDTH guard
 * bytes of 255 above (filled in by vp9_dsputil_static_init()). */
uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
/* Points CROP_WIDTH bytes into vp9_ff_cropTbl_a, so vp9_ff_cropTbl[i] clamps
 * i to [0, 255] for i in [-CROP_WIDTH, 255 + CROP_WIDTH). Used via the DSPr2
 * "lbux" instruction to saturate filter outputs without branches. */
uint8_t *vp9_ff_cropTbl;
25
vp9_dsputil_static_init(void)26 void vp9_dsputil_static_init(void) {
27 int i;
28
29 for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
30
31 for (i = 0; i < CROP_WIDTH; i++) {
32 vp9_ff_cropTbl_a[i] = 0;
33 vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
34 }
35
36 vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
37 }
38
/* Horizontal 8-tap convolution over a 4-pixel-wide block, writing the output
 * TRANSPOSED: the four filtered pixels of one source row are stored down a
 * destination column (stepping by dst_stride), and successive source rows
 * advance dst by one byte.  This produces the transposed intermediate used by
 * the two-pass 8-tap convolve.
 *
 *   src        - source pixels (filter reads src[0..10] per row)
 *   src_stride - source row pitch in bytes
 *   dst        - transposed destination
 *   dst_stride - destination row pitch in bytes (column step here)
 *   filter_x0  - 8 int16 filter taps; read below as four packed 16x2 words,
 *                so the pointer is assumed 4-byte aligned
 *   h          - number of source rows to filter
 */
static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int16_t *filter_x0,
                                              int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;  /* clamp LUT, indexed via lbux */
  uint8_t *dst_ptr;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;        /* rounding constant: 1 << (FILTER_BITS - 1) */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t tn1, tn2;

  /* Load the 8 taps as four packed pairs for dpa.w.ph.
   * NOTE(review): int32 access to int16 data — relies on the taps being
   * 4-byte aligned and on little-endian pairing. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    dst_ptr = dst;
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);

    /* Even-position outputs come from the byte-aligned loads; the balign
     * sequence below rebuilds the same data shifted by one byte so the odd
     * positions can reuse the packed-pair multiply-accumulate pattern. */
    __asm__ __volatile__ (
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "ulw              %[tn2],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        "balign           %[tn1],         %[tn2],         3              \n\t"
        "balign           %[tn2],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"

        /* store bytes: the four results of this row go down a dst column */
        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
          [dst_ptr] "+r" (dst_ptr)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
    );

    /* Next row... (one byte over in the transposed destination) */
    src += src_stride;
    dst += 1;
  }
}
154
/* Horizontal 8-tap convolution over an 8-pixel-wide block, written TRANSPOSED:
 * even-index outputs go down one destination column via dst_ptr, odd-index
 * outputs down the interleaved column via odd_dst (both stepping by
 * 2 * dst_stride), and each new source row advances dst by one byte.
 *
 *   src        - source pixels (filter reads src[0..14] per row)
 *   src_stride - source row pitch in bytes
 *   dst        - transposed destination
 *   dst_stride - destination row pitch in bytes (column step here)
 *   filter_x0  - 8 int16 taps, read as four packed 16x2 words
 *                (assumed 4-byte aligned)
 *   h          - number of source rows to filter
 */
static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int16_t *filter_x0,
                                              int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;  /* clamp LUT, indexed via lbux */
  uint8_t *dst_ptr;
  uint32_t vector4a = 64;        /* rounding constant: 1 << (FILTER_BITS - 1) */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4, n1;
  uint8_t *odd_dst;
  uint32_t dst_pitch_2 = (dst_stride << 1);  /* step between same-parity outputs */

  /* Taps as packed pairs for dpa.w.ph (see 4-wide variant for the aliasing
   * caveat on this int32 access). */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);

    dst_ptr = dst;
    odd_dst = (dst_ptr + dst_stride);

    /* Even outputs use aligned loads at src+0; odd outputs reload at src+1
     * so both share the same packed-pair accumulate pattern.  Stores are
     * interleaved with the arithmetic to hide latency. */
    __asm__ __volatile__ (
        "ulw              %[tp2],         0(%[src])                       \n\t"
        "ulw              %[tp1],         4(%[src])                       \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                            \n\t"
        "mthi             $zero,          $ac3                            \n\t"
        "mtlo             %[vector4a],    $ac2                            \n\t"
        "mthi             $zero,          $ac2                            \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
        "ulw              %[tp3],         8(%[src])                       \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
        "extp             %[Temp1],       $ac3,           31              \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
        "ulw              %[tp2],         12(%[src])                      \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
        "extp             %[Temp3],       $ac2,           31              \n\t"

        /* even 3. pixel */
        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
        "mtlo             %[vector4a],    $ac1                            \n\t"
        "mthi             $zero,          $ac1                            \n\t"
        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
        "extp             %[p3],          $ac1,           31              \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                            \n\t"
        "mthi             $zero,          $ac2                            \n\t"
        "mtlo             %[vector4a],    $ac3                            \n\t"
        "mthi             $zero,          $ac3                            \n\t"
        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"

        "ulw              %[tp1],         1(%[src])                       \n\t"
        "ulw              %[tp3],         5(%[src])                       \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
        "extp             %[Temp3],       $ac2,           31              \n\t"

        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                            \n\t"
        "mthi             $zero,          $ac1                            \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
        "ulw              %[tp2],         9(%[src])                       \n\t"

        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
        "extp             %[Temp2],       $ac3,           31              \n\t"

        /* odd 2. pixel */
        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
        "mtlo             %[vector4a],    $ac3                            \n\t"
        "mthi             $zero,          $ac3                            \n\t"
        "mtlo             %[vector4a],    $ac2                            \n\t"
        "mthi             $zero,          $ac2                            \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
        "ulw              %[Temp1],       13(%[src])                      \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
        "extp             %[Temp3],       $ac1,           31              \n\t"

        /* odd 3. pixel */
        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
        "extp             %[Temp2],       $ac3,           31              \n\t"

        /* odd 4. pixel */
        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
        "extp             %[Temp1],       $ac2,           31              \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"

        /* store bytes */
        "sb               %[p4],          0(%[odd_dst])                   \n\t"
        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"

        "sb               %[p2],          0(%[odd_dst])                   \n\t"
        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"

        "sb               %[n1],          0(%[odd_dst])                   \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a), [cm] "r" (cm),
          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    );

    /* Next row... (one byte over in the transposed destination) */
    src += src_stride;
    dst += 1;
  }
}
330
/* Horizontal 8-tap convolution over (count * 16)-pixel-wide rows, written
 * TRANSPOSED: within each 16-pixel chunk the eight even outputs go down the
 * dst column and the eight odd outputs down the odd_dst column (both stepping
 * by 2 * dst_stride); each new source row advances dst_ptr by one byte.
 * Software-pipelined across the three DSP accumulators $ac1..$ac3: while one
 * result is extracted/clamped/stored, the next two are already accumulating.
 * The trailing "even N"/"odd N" comments name the pixel each instruction
 * belongs to in that pipeline.
 *
 *   src_ptr    - source pixels (reads up to src[21] + 1 per chunk)
 *   src_stride - source row pitch in bytes
 *   dst_ptr    - transposed destination
 *   dst_stride - destination row pitch in bytes (column step here)
 *   filter_x0  - 8 int16 taps, read as four packed 16x2 words
 *                (assumed 4-byte aligned)
 *   h          - number of source rows to filter
 *   count      - number of 16-pixel chunks per row (presumably w / 16 —
 *                callers not visible here)
 */
static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
                                               int32_t src_stride,
                                               uint8_t *dst_ptr,
                                               int32_t dst_stride,
                                               const int16_t *filter_x0,
                                               int32_t h,
                                               int32_t count) {
  int32_t c, y;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;  /* clamp LUT, indexed via lbux */
  uint32_t vector_64 = 64;       /* rounding constant: 1 << (FILTER_BITS - 1) */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  uint32_t dst_pitch_2 = (dst_stride << 1);  /* step between same-parity outputs */
  uint8_t *odd_dst;

  /* Taps as packed pairs for dpa.w.ph (int32 access assumes 4-byte-aligned
   * tap array). */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);

    src = src_ptr;
    dst = dst_ptr;

    odd_dst = (dst + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],      0(%[src])                       \n\t"
          "ulw              %[qload2],      4(%[src])                       \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                            \n\t" /* even 1 */
          "mthi             $zero,          $ac1                            \n\t"
          "mtlo             %[vector_64],   $ac2                            \n\t" /* even 2 */
          "mthi             $zero,          $ac2                            \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                       \n\t"
          "ulw              %[qload2],      8(%[src])                       \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]     \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter34]     \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter56]     \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter78]     \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,           31              \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                            \n\t" /* even 3 */
          "mthi             $zero,          $ac3                            \n\t"
          "preceu.ph.qbr    %[p1],          %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p5],          %[qload2]                       \n\t"
          "ulw              %[qload1],      12(%[src])                      \n\t"
          "dpa.w.ph         $ac2,           %[p2],          %[filter12]     \n\t" /* even 1 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter34]     \n\t" /* even 1 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter56]     \n\t" /* even 1 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter78]     \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])                 \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,           31              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                            \n\t" /* even 4 */
          "mthi             $zero,          $ac1                            \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                       \n\t"
          "sb               %[st1],         0(%[dst])                       \n\t" /* even 1 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,           %[p3],          %[filter12]     \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter34]     \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter56]     \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter78]     \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,           31              \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])                 \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                            \n\t" /* even 5 */
          "mthi             $zero,          $ac2                            \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                       \n\t"
          "sb               %[st2],         0(%[dst])                       \n\t" /* even 2 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2]  \n\t"
          "ulw              %[qload2],      16(%[src])                      \n\t"
          "dpa.w.ph         $ac1,           %[p4],          %[filter12]     \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter34]     \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter56]     \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter78]     \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,           31              \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])                 \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                            \n\t" /* even 6 */
          "mthi             $zero,          $ac3                            \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                       \n\t"
          "sb               %[st3],         0(%[dst])                       \n\t" /* even 3 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac2,           %[p1],          %[filter12]     \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter34]     \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter56]     \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter78]     \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,           31              \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])                 \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                            \n\t" /* even 7 */
          "mthi             $zero,          $ac1                            \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                       \n\t"
          "sb               %[st1],         0(%[dst])                       \n\t" /* even 4 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2]  \n\t"
          "ulw              %[qload1],      20(%[src])                      \n\t"
          "dpa.w.ph         $ac3,           %[p5],          %[filter12]     \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]     \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]     \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]     \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,           31              \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])                 \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                            \n\t" /* even 8 */
          "mthi             $zero,          $ac2                            \n\t"
          "preceu.ph.qbr    %[p5],          %[qload1]                       \n\t"
          "sb               %[st2],         0(%[dst])                       \n\t" /* even 5 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]     \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]     \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]     \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]     \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,           31              \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])                 \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                            \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                            \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]     \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]     \n\t" /* even 8 */
          "sb               %[st3],         0(%[dst])                       \n\t" /* even 6 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]     \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]     \n\t" /* even 8 */
          "extp             %[Temp2],       $ac2,           31              \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])                 \n\t" /* even 7 */

          /* ODD pixels: reload one byte over so odd positions reuse the
             packed-pair pattern */
          "ulw              %[qload1],      1(%[src])                       \n\t"
          "ulw              %[qload2],      5(%[src])                       \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                            \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                            \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                       \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                       \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                       \n\t"
          "sb               %[st1],         0(%[dst])                       \n\t" /* even 7 */
          "addu             %[dst],         %[dst],         %[dst_pitch_2]  \n\t"
          "ulw              %[qload2],      9(%[src])                       \n\t"
          "dpa.w.ph         $ac3,           %[p1],          %[filter12]     \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]     \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]     \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]     \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,           31              \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])                 \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                            \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                            \n\t"
          "preceu.ph.qbr    %[p1],          %[qload2]                       \n\t"
          "preceu.ph.qbl    %[p5],          %[qload2]                       \n\t"
          "sb               %[st2],         0(%[dst])                       \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                      \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]     \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]     \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]     \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]     \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,           31              \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])                 \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                            \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                            \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                       \n\t"
          "sb               %[st3],         0(%[odd_dst])                   \n\t" /* odd 1 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]     \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]     \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]     \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]     \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,           31              \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])                 \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                            \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                            \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                       \n\t"
          "sb               %[st1],         0(%[odd_dst])                   \n\t" /* odd 2 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
          "ulw              %[qload2],      17(%[src])                      \n\t"
          "dpa.w.ph         $ac3,           %[p4],          %[filter12]     \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter34]     \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter56]     \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter78]     \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,           31              \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])                 \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                            \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                            \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                       \n\t"
          "sb               %[st2],         0(%[odd_dst])                   \n\t" /* odd 3 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]     \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter34]     \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter56]     \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter78]     \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,           31              \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])                 \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                            \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                            \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                       \n\t"
          "sb               %[st3],         0(%[odd_dst])                   \n\t" /* odd 4 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
          "ulw              %[qload1],      21(%[src])                      \n\t"
          "dpa.w.ph         $ac2,           %[p5],          %[filter12]     \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter34]     \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter56]     \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter78]     \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,           31              \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])                 \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                            \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                            \n\t"
          "preceu.ph.qbr    %[p5],          %[qload1]                       \n\t"
          "sb               %[st1],         0(%[odd_dst])                   \n\t" /* odd 5 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
          "dpa.w.ph         $ac3,           %[p2],          %[filter12]     \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter34]     \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter56]     \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter78]     \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,           31              \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],          %[filter12]     \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter34]     \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter56]     \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter78]     \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,           31              \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])                 \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])                 \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])                 \n\t" /* odd 8 */

          "sb               %[st2],         0(%[odd_dst])                   \n\t" /* odd 6 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"

          "sb               %[st3],         0(%[odd_dst])                   \n\t" /* odd 7 */
          "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"

          "sb               %[st1],         0(%[odd_dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64), [cm] "r" (cm),
            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
      );

      /* Advance to the next 16-pixel chunk; in the transposed destination
         that is 16 rows further down. */
      src += 16;
      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
      odd_dst = (dst + dst_stride);
    }

    /* Next row... (one byte over in the transposed destination) */
    src_ptr += src_stride;

    dst_ptr += 1;
  }
}
621
convolve_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)622 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
623 int32_t src_stride,
624 uint8_t *dst_ptr,
625 int32_t dst_stride,
626 const int16_t *filter_x0,
627 int32_t h) {
628 int32_t c, y;
629 const uint8_t *src;
630 uint8_t *dst;
631 uint8_t *cm = vp9_ff_cropTbl;
632 uint32_t vector_64 = 64;
633 int32_t filter12, filter34, filter56, filter78;
634 int32_t Temp1, Temp2, Temp3;
635 uint32_t qload1, qload2;
636 uint32_t p1, p2, p3, p4, p5;
637 uint32_t st1, st2, st3;
638 uint32_t dst_pitch_2 = (dst_stride << 1);
639 uint8_t *odd_dst;
640
641 filter12 = ((const int32_t *)filter_x0)[0];
642 filter34 = ((const int32_t *)filter_x0)[1];
643 filter56 = ((const int32_t *)filter_x0)[2];
644 filter78 = ((const int32_t *)filter_x0)[3];
645
646 for (y = h; y--;) {
647 /* prefetch data to cache memory */
648 vp9_prefetch_load(src_ptr + src_stride);
649 vp9_prefetch_load(src_ptr + src_stride + 32);
650 vp9_prefetch_load(src_ptr + src_stride + 64);
651
652 src = src_ptr;
653 dst = dst_ptr;
654
655 odd_dst = (dst + dst_stride);
656
657 for (c = 0; c < 4; c++) {
658 __asm__ __volatile__ (
659 "ulw %[qload1], 0(%[src]) \n\t"
660 "ulw %[qload2], 4(%[src]) \n\t"
661
662 /* even 1. pixel */
663 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
664 "mthi $zero, $ac1 \n\t"
665 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
666 "mthi $zero, $ac2 \n\t"
667 "preceu.ph.qbr %[p3], %[qload2] \n\t"
668 "preceu.ph.qbl %[p4], %[qload2] \n\t"
669 "preceu.ph.qbr %[p1], %[qload1] \n\t"
670 "preceu.ph.qbl %[p2], %[qload1] \n\t"
671 "ulw %[qload2], 8(%[src]) \n\t"
672 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
673 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
674 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
675 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
676 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
677
678 /* even 2. pixel */
679 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
680 "mthi $zero, $ac3 \n\t"
681 "preceu.ph.qbr %[p1], %[qload2] \n\t"
682 "preceu.ph.qbl %[p5], %[qload2] \n\t"
683 "ulw %[qload1], 12(%[src]) \n\t"
684 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
685 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
686 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
687 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
688 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
689 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
690
691 /* even 3. pixel */
692 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
693 "mthi $zero, $ac1 \n\t"
694 "preceu.ph.qbr %[p2], %[qload1] \n\t"
695 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
696 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
697 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
698 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
699 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
700 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
701 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
702 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
703
704 /* even 4. pixel */
705 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
706 "mthi $zero, $ac2 \n\t"
707 "preceu.ph.qbl %[p3], %[qload1] \n\t"
708 "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
709 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
710 "ulw %[qload2], 16(%[src]) \n\t"
711 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
712 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
713 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
714 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
715 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
716 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
717
718 /* even 5. pixel */
719 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
720 "mthi $zero, $ac3 \n\t"
721 "preceu.ph.qbr %[p4], %[qload2] \n\t"
722 "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
723 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
724 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
725 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
726 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
727 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
728 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
729 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
730
731 /* even 6. pixel */
732 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
733 "mthi $zero, $ac1 \n\t"
734 "preceu.ph.qbl %[p1], %[qload2] \n\t"
735 "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
736 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
737 "ulw %[qload1], 20(%[src]) \n\t"
738 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
739 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
740 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
741 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
742 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
743 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
744
745 /* even 7. pixel */
746 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
747 "mthi $zero, $ac2 \n\t"
748 "preceu.ph.qbr %[p5], %[qload1] \n\t"
749 "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
750 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
751 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
752 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
753 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
754 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
755 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
756 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
757
758 /* even 8. pixel */
759 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
760 "mthi $zero, $ac3 \n\t"
761 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
762 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
763 "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
764 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
765 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
766 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
767 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
768 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
769
770 /* ODD pixels */
771 "ulw %[qload1], 1(%[src]) \n\t"
772 "ulw %[qload2], 5(%[src]) \n\t"
773
774 /* odd 1. pixel */
775 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
776 "mthi $zero, $ac1 \n\t"
777 "preceu.ph.qbr %[p1], %[qload1] \n\t"
778 "preceu.ph.qbl %[p2], %[qload1] \n\t"
779 "preceu.ph.qbr %[p3], %[qload2] \n\t"
780 "preceu.ph.qbl %[p4], %[qload2] \n\t"
781 "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
782 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
783 "ulw %[qload2], 9(%[src]) \n\t"
784 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
785 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
786 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
787 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
788 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
789 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
790
791 /* odd 2. pixel */
792 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
793 "mthi $zero, $ac2 \n\t"
794 "preceu.ph.qbr %[p1], %[qload2] \n\t"
795 "preceu.ph.qbl %[p5], %[qload2] \n\t"
796 "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
797 "ulw %[qload1], 13(%[src]) \n\t"
798 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
799 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
800 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
801 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
802 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
803 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
804
805 /* odd 3. pixel */
806 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
807 "mthi $zero, $ac3 \n\t"
808 "preceu.ph.qbr %[p2], %[qload1] \n\t"
809 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
810 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
811 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
812 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
813 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
814 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
815 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
816 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
817
818 /* odd 4. pixel */
819 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
820 "mthi $zero, $ac1 \n\t"
821 "preceu.ph.qbl %[p3], %[qload1] \n\t"
822 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
823 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
824 "ulw %[qload2], 17(%[src]) \n\t"
825 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
826 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
827 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
828 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
829 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
830 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
831
832 /* odd 5. pixel */
833 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
834 "mthi $zero, $ac2 \n\t"
835 "preceu.ph.qbr %[p4], %[qload2] \n\t"
836 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
837 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
838 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
839 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
840 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
841 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
842 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
843 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
844
845 /* odd 6. pixel */
846 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
847 "mthi $zero, $ac3 \n\t"
848 "preceu.ph.qbl %[p1], %[qload2] \n\t"
849 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
850 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
851 "ulw %[qload1], 21(%[src]) \n\t"
852 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
853 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
854 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
855 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
856 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
857 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
858
859 /* odd 7. pixel */
860 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
861 "mthi $zero, $ac1 \n\t"
862 "preceu.ph.qbr %[p5], %[qload1] \n\t"
863 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
864 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
865 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
866 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
867 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
868 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
869 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
870
871 /* odd 8. pixel */
872 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
873 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
874 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
875 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
876 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
877
878 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
879 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
880 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
881
882 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
883 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
884
885 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
886 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
887
888 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
889
890 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
891 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
892 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
893 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
894 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
895 : [filter12] "r" (filter12), [filter34] "r" (filter34),
896 [filter56] "r" (filter56), [filter78] "r" (filter78),
897 [vector_64] "r" (vector_64), [cm] "r" (cm),
898 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
899 );
900
901 src += 16;
902 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
903 odd_dst = (dst + dst_stride);
904 }
905
906 /* Next row... */
907 src_ptr += src_stride;
908
909 dst_ptr += 1;
910 }
911 }
912
convolve_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)913 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
914 uint8_t *dst, ptrdiff_t dst_stride,
915 const int16_t *filter, int w, int h) {
916 int x, y, k;
917
918 for (y = 0; y < h; ++y) {
919 for (x = 0; x < w; ++x) {
920 int sum = 0;
921
922 for (k = 0; k < 8; ++k)
923 sum += src[x + k] * filter[k];
924
925 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
926 }
927
928 src += src_stride;
929 dst += 1;
930 }
931 }
932
/* Copy a w x h block while transposing it: input pixel (x, y) lands at
 * dst[y + x * dst_stride].  Used when the horizontal filter is the unit
 * filter, so only the transpose into the intermediate buffer is needed.
 *
 * src        - input pixels
 * src_stride - input row pitch in bytes
 * dst        - output base
 * dst_stride - output column pitch in bytes
 * w, h       - input block width and height
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int rows = h;

  while (rows-- > 0) {
    const uint8_t *in = src;
    uint8_t *out = dst;
    int col;

    for (col = 0; col < w; ++col) {
      *out = in[col];
      out += dst_stride;
    }

    src += src_stride;
    dst += 1;  /* next input row -> next output column */
  }
}
947
/* DSPr2-accelerated 8-tap 2-D convolution (unscaled, 16/16 stepping only).
 *
 * Strategy: filter horizontally into a transposed scratch buffer, then run
 * the same horizontal-filter kernels again with filter_y — because the
 * intermediate is transposed, the second horizontal pass is effectively the
 * vertical pass, and its transposed store restores the original orientation.
 *
 * Falls back to vp9_convolve8_c for sub-pel stepping != 16, and to
 * vp9_convolve_copy when both filters are the unit filter.
 */
void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Transposed intermediate: up to 64 columns x (64 + 7 filter taps) rows,
     32-byte aligned for the asm kernels. */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;

  /* Set the DSP control register so "extp" extracts the 32-bit result
     starting at bit position 38 of the 64-bit accumulator. */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  if (intermediate_height < h)
    intermediate_height = h;

  /* Only the unscaled (16/16 step) case is handled by the asm kernels. */
  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);

  /* Word 1 of the filter covers taps 2-3; 0x800000 corresponds to taps
     {0, 128}, i.e. the unit filter, so the whole convolution is a copy.
     NOTE(review): reinterprets int16_t[] as int32_t[] and the 0x800000
     constant assumes little-endian tap layout — confirm for the target. */
  if ((((const int32_t *)filter_x)[1] == 0x800000)
      && (((const int32_t *)filter_y)[1] == 0x800000))
    return vp9_convolve_copy(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);

  /* First pass: horizontal filter into the transposed scratch buffer.
     Source is offset up by 3 rows to cover the vertical filter's taps. */
  if (filter_x[3] == 0x80) {
    /* Center tap is 128 and the rest are implicitly zero: plain transpose. */
    copy_horiz_transposed(src - src_stride * 3, src_stride,
                          temp, intermediate_height,
                          w, intermediate_height);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* First two filter words (taps 0-3) all zero: 2-tap fast path. */
    vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
                        temp, intermediate_height,
                        filter_x,
                        w, intermediate_height);
  } else {
    /* Full 8-tap path; also back up 3 columns for the horizontal taps. */
    src -= (src_stride * 3 + 3);

    /* prefetch data to cache memory */
    vp9_prefetch_load(src);
    vp9_prefetch_load(src + 32);

    switch (w) {
      case 4:
        convolve_horiz_4_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 16:
      case 32:
        /* 16-wide kernel iterated (w/16) times across the row. */
        convolve_horiz_16_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height,
                                           (w/16));
        break;
      case 64:
        vp9_prefetch_load(src + 32);
        convolve_horiz_64_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height);
        break;
      default:
        /* Non-standard width: generic C fallback. */
        convolve_horiz_transposed(src, src_stride,
                                  temp, intermediate_height,
                                  filter_x, w, intermediate_height);
        break;
    }
  }

  /* Second pass: filter the transposed intermediate with filter_y using the
     same horizontal kernels; "+ 3" skips the 3 context rows added above.
     Note w and h swap roles because the data is transposed. */
  if (filter_y[3] == 0x80) {
    copy_horiz_transposed(temp + 3, intermediate_height,
                          dst, dst_stride,
                          h, w);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_dspr2(temp + 3, intermediate_height,
                        dst, dst_stride,
                        filter_y,
                        h, w);
  } else {
    switch (h) {
      case 4:
        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w, (h/16));
        break;
      case 64:
        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w);
        break;
      default:
        convolve_horiz_transposed(temp, intermediate_height,
                                  dst, dst_stride,
                                  filter_y, h, w);
        break;
    }
  }
}
1072
/* DSPr2-accelerated block copy (the "no filtering" convolve variant).
 *
 * filter_x/filter_y and their strides are part of the convolve function
 * signature but are unused here.  Each power-of-two width has a dedicated
 * loop that copies a full row per iteration with unaligned word loads
 * ("ulw") and word stores ("sw"), interleaved with cache prefetches for the
 * next row.  NOTE(review): the stores assume dst rows are 4-byte aligned —
 * confirm against the callers' buffer allocation.
 */
void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      {
        uint32_t tp1;

        /* 1 word storage */
        for (y = h; y--; ) {
          /* Prefetch the next row while copying this one. */
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         (%[src])      \n\t"
              "sw               %[tp1],         (%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 8:
      {
        uint32_t tp1, tp2;

        /* 2 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 16:
      {
        uint32_t tp1, tp2, tp3, tp4;

        /* 4 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 32:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        /* 8 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 64:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        /* Wider rows need extra prefetch coverage up front. */
        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        /* 16 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_load(src + src_stride + 64);
          vp9_prefetch_store(dst + dst_stride);
          vp9_prefetch_store(dst + dst_stride + 32);

          /* Row copied in two 32-byte halves, reusing the 8 temporaries. */
          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              "ulw              %[tp1],         32(%[src])     \n\t"
              "ulw              %[tp2],         36(%[src])     \n\t"
              "ulw              %[tp3],         40(%[src])     \n\t"
              "ulw              %[tp4],         44(%[src])     \n\t"
              "ulw              %[tp5],         48(%[src])     \n\t"
              "ulw              %[tp6],         52(%[src])     \n\t"
              "ulw              %[tp7],         56(%[src])     \n\t"
              "ulw              %[tp8],         60(%[src])     \n\t"

              "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
              "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
              "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
              "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         60(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    default:
      /* Non-standard width: plain byte-wise C copy. */
      for (y = h; y--; ) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1284 #endif
1285