1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_bi_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_x0,
26 int32_t h) {
27 int32_t y;
28 uint8_t *cm = vpx_ff_cropTbl;
29 uint8_t *dst_ptr;
30 int32_t Temp1, Temp2;
31 uint32_t vector4a = 64;
32 uint32_t tp1, tp2;
33 uint32_t p1, p2;
34 const int16_t *filter = &filter_x0[3];
35 uint32_t filter45;
36
37 filter45 = ((const int32_t *)filter)[0];
38
39 for (y = h; y--;) {
40 dst_ptr = dst;
41 /* prefetch data to cache memory */
42 prefetch_load(src + src_stride);
43 prefetch_load(src + src_stride + 32);
44
45 __asm__ __volatile__ (
46 "ulw %[tp1], 0(%[src]) \n\t"
47 "ulw %[tp2], 4(%[src]) \n\t"
48
49 /* even 1. pixel */
50 "mtlo %[vector4a], $ac3 \n\t"
51 "mthi $zero, $ac3 \n\t"
52 "preceu.ph.qbr %[p1], %[tp1] \n\t"
53 "preceu.ph.qbl %[p2], %[tp1] \n\t"
54 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
55 "extp %[Temp1], $ac3, 31 \n\t"
56
57 /* even 2. pixel */
58 "mtlo %[vector4a], $ac2 \n\t"
59 "mthi $zero, $ac2 \n\t"
60 "balign %[tp2], %[tp1], 3 \n\t"
61 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
62 "extp %[Temp2], $ac2, 31 \n\t"
63
64 /* odd 1. pixel */
65 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
66 "mtlo %[vector4a], $ac3 \n\t"
67 "mthi $zero, $ac3 \n\t"
68 "preceu.ph.qbr %[p1], %[tp2] \n\t"
69 "preceu.ph.qbl %[p2], %[tp2] \n\t"
70 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
71 "extp %[Temp1], $ac3, 31 \n\t"
72
73 /* odd 2. pixel */
74 "lbux %[tp2], %[Temp2](%[cm]) \n\t"
75 "mtlo %[vector4a], $ac2 \n\t"
76 "mthi $zero, $ac2 \n\t"
77 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
78 "extp %[Temp2], $ac2, 31 \n\t"
79
80 /* clamp */
81 "lbux %[p1], %[Temp1](%[cm]) \n\t"
82 "lbux %[p2], %[Temp2](%[cm]) \n\t"
83
84 /* store bytes */
85 "sb %[tp1], 0(%[dst_ptr]) \n\t"
86 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
87
88 "sb %[p1], 0(%[dst_ptr]) \n\t"
89 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
90
91 "sb %[tp2], 0(%[dst_ptr]) \n\t"
92 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
93
94 "sb %[p2], 0(%[dst_ptr]) \n\t"
95 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
96
97 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
98 [p1] "=&r" (p1), [p2] "=&r" (p2),
99 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
100 [dst_ptr] "+r" (dst_ptr)
101 : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
102 [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
103 );
104
105 /* Next row... */
106 src += src_stride;
107 dst += 1;
108 }
109 }
110
convolve_bi_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)111 static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
112 int32_t src_stride,
113 uint8_t *dst,
114 int32_t dst_stride,
115 const int16_t *filter_x0,
116 int32_t h) {
117 int32_t y;
118 uint8_t *cm = vpx_ff_cropTbl;
119 uint8_t *dst_ptr;
120 uint32_t vector4a = 64;
121 int32_t Temp1, Temp2, Temp3;
122 uint32_t tp1, tp2, tp3;
123 uint32_t p1, p2, p3, p4;
124 uint8_t *odd_dst;
125 uint32_t dst_pitch_2 = (dst_stride << 1);
126 const int16_t *filter = &filter_x0[3];
127 uint32_t filter45;
128
129 filter45 = ((const int32_t *)filter)[0];
130
131 for (y = h; y--;) {
132 /* prefetch data to cache memory */
133 prefetch_load(src + src_stride);
134 prefetch_load(src + src_stride + 32);
135
136 dst_ptr = dst;
137 odd_dst = (dst_ptr + dst_stride);
138
139 __asm__ __volatile__ (
140 "ulw %[tp1], 0(%[src]) \n\t"
141 "ulw %[tp2], 4(%[src]) \n\t"
142
143 /* even 1. pixel */
144 "mtlo %[vector4a], $ac3 \n\t"
145 "mthi $zero, $ac3 \n\t"
146 "mtlo %[vector4a], $ac2 \n\t"
147 "mthi $zero, $ac2 \n\t"
148 "preceu.ph.qbr %[p1], %[tp1] \n\t"
149 "preceu.ph.qbl %[p2], %[tp1] \n\t"
150 "preceu.ph.qbr %[p3], %[tp2] \n\t"
151 "preceu.ph.qbl %[p4], %[tp2] \n\t"
152 "ulw %[tp3], 8(%[src]) \n\t"
153 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
154 "extp %[Temp1], $ac3, 31 \n\t"
155
156 /* even 2. pixel */
157 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
158 "extp %[Temp3], $ac2, 31 \n\t"
159
160 /* even 3. pixel */
161 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
162 "mtlo %[vector4a], $ac1 \n\t"
163 "mthi $zero, $ac1 \n\t"
164 "balign %[tp3], %[tp2], 3 \n\t"
165 "balign %[tp2], %[tp1], 3 \n\t"
166 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
167 "lbux %[tp1], %[Temp3](%[cm]) \n\t"
168 "extp %[p3], $ac1, 31 \n\t"
169
170 /* even 4. pixel */
171 "mtlo %[vector4a], $ac2 \n\t"
172 "mthi $zero, $ac2 \n\t"
173 "mtlo %[vector4a], $ac3 \n\t"
174 "mthi $zero, $ac3 \n\t"
175 "sb %[Temp2], 0(%[dst_ptr]) \n\t"
176 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
177 "sb %[tp1], 0(%[dst_ptr]) \n\t"
178 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
179
180 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
181 "extp %[Temp3], $ac2, 31 \n\t"
182
183 "lbux %[Temp1], %[p3](%[cm]) \n\t"
184
185 /* odd 1. pixel */
186 "mtlo %[vector4a], $ac1 \n\t"
187 "mthi $zero, $ac1 \n\t"
188 "preceu.ph.qbr %[p1], %[tp2] \n\t"
189 "preceu.ph.qbl %[p2], %[tp2] \n\t"
190 "preceu.ph.qbr %[p3], %[tp3] \n\t"
191 "preceu.ph.qbl %[p4], %[tp3] \n\t"
192 "sb %[Temp1], 0(%[dst_ptr]) \n\t"
193 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
194
195 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
196 "extp %[Temp2], $ac3, 31 \n\t"
197
198 /* odd 2. pixel */
199 "lbux %[tp1], %[Temp3](%[cm]) \n\t"
200 "mtlo %[vector4a], $ac3 \n\t"
201 "mthi $zero, $ac3 \n\t"
202 "mtlo %[vector4a], $ac2 \n\t"
203 "mthi $zero, $ac2 \n\t"
204 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
205 "sb %[tp1], 0(%[dst_ptr]) \n\t"
206 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
207 "extp %[Temp3], $ac1, 31 \n\t"
208
209 /* odd 3. pixel */
210 "lbux %[tp3], %[Temp2](%[cm]) \n\t"
211 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
212 "extp %[Temp2], $ac3, 31 \n\t"
213
214 /* odd 4. pixel */
215 "sb %[tp3], 0(%[odd_dst]) \n\t"
216 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
217 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
218 "extp %[Temp1], $ac2, 31 \n\t"
219
220 /* clamp */
221 "lbux %[p4], %[Temp3](%[cm]) \n\t"
222 "lbux %[p2], %[Temp2](%[cm]) \n\t"
223 "lbux %[p1], %[Temp1](%[cm]) \n\t"
224
225 /* store bytes */
226 "sb %[p4], 0(%[odd_dst]) \n\t"
227 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
228
229 "sb %[p2], 0(%[odd_dst]) \n\t"
230 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
231
232 "sb %[p1], 0(%[odd_dst]) \n\t"
233
234 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
235 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
236 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
237 [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
238 : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
239 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
240 );
241
242 /* Next row... */
243 src += src_stride;
244 dst += 1;
245 }
246 }
247
convolve_bi_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)248 static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
249 int32_t src_stride,
250 uint8_t *dst_ptr,
251 int32_t dst_stride,
252 const int16_t *filter_x0,
253 int32_t h,
254 int32_t count) {
255 int32_t c, y;
256 const uint8_t *src;
257 uint8_t *dst;
258 uint8_t *cm = vpx_ff_cropTbl;
259 uint32_t vector_64 = 64;
260 int32_t Temp1, Temp2, Temp3;
261 uint32_t qload1, qload2;
262 uint32_t p1, p2, p3, p4, p5;
263 uint32_t st1, st2, st3;
264 uint32_t dst_pitch_2 = (dst_stride << 1);
265 uint8_t *odd_dst;
266 const int16_t *filter = &filter_x0[3];
267 uint32_t filter45;
268
269 filter45 = ((const int32_t *)filter)[0];
270
271 for (y = h; y--;) {
272 /* prefetch data to cache memory */
273 prefetch_load(src_ptr + src_stride);
274 prefetch_load(src_ptr + src_stride + 32);
275
276 src = src_ptr;
277 dst = dst_ptr;
278
279 odd_dst = (dst + dst_stride);
280
281 for (c = 0; c < count; c++) {
282 __asm__ __volatile__ (
283 "ulw %[qload1], 0(%[src]) \n\t"
284 "ulw %[qload2], 4(%[src]) \n\t"
285
286 /* even 1. pixel */
287 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
288 "mthi $zero, $ac1 \n\t"
289 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
290 "mthi $zero, $ac2 \n\t"
291 "preceu.ph.qbr %[p1], %[qload1] \n\t"
292 "preceu.ph.qbl %[p2], %[qload1] \n\t"
293 "preceu.ph.qbr %[p3], %[qload2] \n\t"
294 "preceu.ph.qbl %[p4], %[qload2] \n\t"
295 "ulw %[qload1], 8(%[src]) \n\t"
296 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
297 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
298
299 /* even 2. pixel */
300 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
301 "mthi $zero, $ac3 \n\t"
302 "preceu.ph.qbr %[p1], %[qload1] \n\t"
303 "preceu.ph.qbl %[p5], %[qload1] \n\t"
304 "ulw %[qload2], 12(%[src]) \n\t"
305 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
306 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
307 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
308
309 /* even 3. pixel */
310 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
311 "mthi $zero, $ac1 \n\t"
312 "preceu.ph.qbr %[p2], %[qload2] \n\t"
313 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
314 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
315 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
316 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
317 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
318
319 /* even 4. pixel */
320 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
321 "mthi $zero, $ac2 \n\t"
322 "preceu.ph.qbl %[p3], %[qload2] \n\t"
323 "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
324 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
325 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
326 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
327 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
328
329 /* even 5. pixel */
330 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
331 "mthi $zero, $ac3 \n\t"
332 "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
333 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
334 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
335 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
336 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
337
338 /* even 6. pixel */
339 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
340 "mthi $zero, $ac1 \n\t"
341 "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
342 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
343 "ulw %[qload1], 20(%[src]) \n\t"
344 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
345 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
346 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
347
348 /* even 7. pixel */
349 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
350 "mthi $zero, $ac2 \n\t"
351 "preceu.ph.qbr %[p5], %[qload1] \n\t"
352 "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
353 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
354 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
355 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
356 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
357
358 /* even 8. pixel */
359 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
360 "mthi $zero, $ac3 \n\t"
361 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
362 "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
363 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
364 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
365 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
366
367 /* ODD pixels */
368 "ulw %[qload1], 1(%[src]) \n\t"
369 "ulw %[qload2], 5(%[src]) \n\t"
370
371 /* odd 1. pixel */
372 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
373 "mthi $zero, $ac1 \n\t"
374 "preceu.ph.qbr %[p1], %[qload1] \n\t"
375 "preceu.ph.qbl %[p2], %[qload1] \n\t"
376 "preceu.ph.qbr %[p3], %[qload2] \n\t"
377 "preceu.ph.qbl %[p4], %[qload2] \n\t"
378 "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
379 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
380 "ulw %[qload2], 9(%[src]) \n\t"
381 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
382 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
383 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
384
385 /* odd 2. pixel */
386 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
387 "mthi $zero, $ac2 \n\t"
388 "preceu.ph.qbr %[p1], %[qload2] \n\t"
389 "preceu.ph.qbl %[p5], %[qload2] \n\t"
390 "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
391 "ulw %[qload1], 13(%[src]) \n\t"
392 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
393 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
394 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
395
396 /* odd 3. pixel */
397 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
398 "mthi $zero, $ac3 \n\t"
399 "preceu.ph.qbr %[p2], %[qload1] \n\t"
400 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
401 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
402 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
403 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
404 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
405
406 /* odd 4. pixel */
407 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
408 "mthi $zero, $ac1 \n\t"
409 "preceu.ph.qbl %[p3], %[qload1] \n\t"
410 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
411 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
412 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
413 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
414 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
415
416 /* odd 5. pixel */
417 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
418 "mthi $zero, $ac2 \n\t"
419 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
420 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
421 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
422 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
423 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
424
425 /* odd 6. pixel */
426 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
427 "mthi $zero, $ac3 \n\t"
428 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
429 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
430 "ulw %[qload1], 21(%[src]) \n\t"
431 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
432 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
433 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
434
435 /* odd 7. pixel */
436 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
437 "mthi $zero, $ac1 \n\t"
438 "preceu.ph.qbr %[p5], %[qload1] \n\t"
439 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
440 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
441 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
442 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
443
444 /* odd 8. pixel */
445 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
446 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
447
448 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
449 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
450 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
451
452 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
453 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
454
455 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
456 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
457
458 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
459
460 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
461 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
462 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
463 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
464 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
465 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
466 [cm] "r" (cm),
467 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
468 );
469
470 src += 16;
471 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
472 odd_dst = (dst + dst_stride);
473 }
474
475 /* Next row... */
476 src_ptr += src_stride;
477 dst_ptr += 1;
478 }
479 }
480
convolve_bi_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)481 static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
482 int32_t src_stride,
483 uint8_t *dst_ptr,
484 int32_t dst_stride,
485 const int16_t *filter_x0,
486 int32_t h) {
487 int32_t c, y;
488 const uint8_t *src;
489 uint8_t *dst;
490 uint8_t *cm = vpx_ff_cropTbl;
491 uint32_t vector_64 = 64;
492 int32_t Temp1, Temp2, Temp3;
493 uint32_t qload1, qload2;
494 uint32_t p1, p2, p3, p4, p5;
495 uint32_t st1, st2, st3;
496 uint32_t dst_pitch_2 = (dst_stride << 1);
497 uint8_t *odd_dst;
498 const int16_t *filter = &filter_x0[3];
499 uint32_t filter45;
500
501 filter45 = ((const int32_t *)filter)[0];
502
503 for (y = h; y--;) {
504 /* prefetch data to cache memory */
505 prefetch_load(src_ptr + src_stride);
506 prefetch_load(src_ptr + src_stride + 32);
507 prefetch_load(src_ptr + src_stride + 64);
508
509 src = src_ptr;
510 dst = dst_ptr;
511
512 odd_dst = (dst + dst_stride);
513
514 for (c = 0; c < 4; c++) {
515 __asm__ __volatile__ (
516 "ulw %[qload1], 0(%[src]) \n\t"
517 "ulw %[qload2], 4(%[src]) \n\t"
518
519 /* even 1. pixel */
520 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
521 "mthi $zero, $ac1 \n\t"
522 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
523 "mthi $zero, $ac2 \n\t"
524 "preceu.ph.qbr %[p1], %[qload1] \n\t"
525 "preceu.ph.qbl %[p2], %[qload1] \n\t"
526 "preceu.ph.qbr %[p3], %[qload2] \n\t"
527 "preceu.ph.qbl %[p4], %[qload2] \n\t"
528 "ulw %[qload1], 8(%[src]) \n\t"
529 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
530 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
531
532 /* even 2. pixel */
533 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
534 "mthi $zero, $ac3 \n\t"
535 "preceu.ph.qbr %[p1], %[qload1] \n\t"
536 "preceu.ph.qbl %[p5], %[qload1] \n\t"
537 "ulw %[qload2], 12(%[src]) \n\t"
538 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
539 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
540 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
541
542 /* even 3. pixel */
543 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
544 "mthi $zero, $ac1 \n\t"
545 "preceu.ph.qbr %[p2], %[qload2] \n\t"
546 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
547 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
548 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
549 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
550 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
551
552 /* even 4. pixel */
553 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
554 "mthi $zero, $ac2 \n\t"
555 "preceu.ph.qbl %[p3], %[qload2] \n\t"
556 "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
557 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
558 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
559 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
560 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
561
562 /* even 5. pixel */
563 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
564 "mthi $zero, $ac3 \n\t"
565 "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
566 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
567 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
568 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
569 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
570
571 /* even 6. pixel */
572 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
573 "mthi $zero, $ac1 \n\t"
574 "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
575 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
576 "ulw %[qload1], 20(%[src]) \n\t"
577 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
578 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
579 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
580
581 /* even 7. pixel */
582 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
583 "mthi $zero, $ac2 \n\t"
584 "preceu.ph.qbr %[p5], %[qload1] \n\t"
585 "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
586 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
587 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
588 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
589 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
590
591 /* even 8. pixel */
592 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
593 "mthi $zero, $ac3 \n\t"
594 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
595 "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
596 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
597 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
598 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
599
600 /* ODD pixels */
601 "ulw %[qload1], 1(%[src]) \n\t"
602 "ulw %[qload2], 5(%[src]) \n\t"
603
604 /* odd 1. pixel */
605 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
606 "mthi $zero, $ac1 \n\t"
607 "preceu.ph.qbr %[p1], %[qload1] \n\t"
608 "preceu.ph.qbl %[p2], %[qload1] \n\t"
609 "preceu.ph.qbr %[p3], %[qload2] \n\t"
610 "preceu.ph.qbl %[p4], %[qload2] \n\t"
611 "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
612 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
613 "ulw %[qload2], 9(%[src]) \n\t"
614 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
615 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
616 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
617
618 /* odd 2. pixel */
619 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
620 "mthi $zero, $ac2 \n\t"
621 "preceu.ph.qbr %[p1], %[qload2] \n\t"
622 "preceu.ph.qbl %[p5], %[qload2] \n\t"
623 "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
624 "ulw %[qload1], 13(%[src]) \n\t"
625 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
626 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
627 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
628
629 /* odd 3. pixel */
630 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
631 "mthi $zero, $ac3 \n\t"
632 "preceu.ph.qbr %[p2], %[qload1] \n\t"
633 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
634 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
635 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
636 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
637 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
638
639 /* odd 4. pixel */
640 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
641 "mthi $zero, $ac1 \n\t"
642 "preceu.ph.qbl %[p3], %[qload1] \n\t"
643 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
644 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
645 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
646 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
647 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
648
649 /* odd 5. pixel */
650 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
651 "mthi $zero, $ac2 \n\t"
652 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
653 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
654 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
655 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
656 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
657
658 /* odd 6. pixel */
659 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
660 "mthi $zero, $ac3 \n\t"
661 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
662 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
663 "ulw %[qload1], 21(%[src]) \n\t"
664 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
665 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
666 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
667
668 /* odd 7. pixel */
669 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
670 "mthi $zero, $ac1 \n\t"
671 "preceu.ph.qbr %[p5], %[qload1] \n\t"
672 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
673 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
674 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
675 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
676
677 /* odd 8. pixel */
678 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
679 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
680
681 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
682 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
683 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
684
685 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
686 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
687
688 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
689 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
690
691 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
692
693 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
694 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
695 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
696 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
697 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
698 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
699 [cm] "r" (cm),
700 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
701 );
702
703 src += 16;
704 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
705 odd_dst = (dst + dst_stride);
706 }
707
708 /* Next row... */
709 src_ptr += src_stride;
710 dst_ptr += 1;
711 }
712 }
713
convolve_bi_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)714 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
715 uint8_t *dst, ptrdiff_t dst_stride,
716 const int16_t *filter, int w, int h) {
717 int x, y;
718
719 for (y = 0; y < h; ++y) {
720 for (x = 0; x < w; ++x) {
721 int sum = 0;
722
723 sum += src[x] * filter[3];
724 sum += src[x + 1] * filter[4];
725
726 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
727 }
728
729 src += src_stride;
730 dst += 1;
731 }
732 }
733
/*
 * Bilinear (2-tap) horizontal convolution with transposed output, dispatched
 * by block width to the DSPr2 kernels in this file.
 *
 * src, src_stride  Source pixels and row pitch.
 * dst, dst_stride  Destination; output x of source row y is written to
 *                  dst[x * dst_stride + y].
 * filter           Filter kernel; only taps filter[3] and filter[4] are used.
 * w, h             Block width and height. Widths 4, 8, 16, 32 and 64 use the
 *                  assembly kernels; any other width uses the C fallback.
 */
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h) {
  uint32_t pos = 38;

  /* Bit position used by the extp instructions in the kernels below to
   * extract the result from the accumulator. */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  /* prefetch the first source row */
  prefetch_load(src);
  prefetch_load(src + 32);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 8:
      convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 16:
    case 32:
      convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h,
                                            (w / 16));
      break;
    case 64:
      /* src and src + 32 were prefetched above; a 64-wide row also reaches
       * past src + 64, matching the three prefetches the 64-wide kernel
       * issues for each following row. */
      prefetch_load(src + 64);
      convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h);
      break;
    default:
      convolve_bi_horiz_transposed(src, src_stride,
                                   dst, dst_stride,
                                   filter, w, h);
      break;
  }
}
782 #endif
783