/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_bi_avg_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_x0,
26 int32_t h) {
27 int32_t y;
28 uint8_t *cm = vpx_ff_cropTbl;
29 int32_t Temp1, Temp2, Temp3, Temp4;
30 uint32_t vector4a = 64;
31 uint32_t tp1, tp2;
32 uint32_t p1, p2, p3;
33 uint32_t tn1, tn2;
34 const int16_t *filter = &filter_x0[3];
35 uint32_t filter45;
36
37 filter45 = ((const int32_t *)filter)[0];
38
39 for (y = h; y--;) {
40 /* prefetch data to cache memory */
41 prefetch_load(src + src_stride);
42 prefetch_load(src + src_stride + 32);
43 prefetch_store(dst + dst_stride);
44
45 __asm__ __volatile__ (
46 "ulw %[tp1], 0(%[src]) \n\t"
47 "ulw %[tp2], 4(%[src]) \n\t"
48
49 /* even 1. pixel */
50 "mtlo %[vector4a], $ac3 \n\t"
51 "mthi $zero, $ac3 \n\t"
52 "preceu.ph.qbr %[p1], %[tp1] \n\t"
53 "preceu.ph.qbl %[p2], %[tp1] \n\t"
54 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
55 "extp %[Temp1], $ac3, 31 \n\t"
56
57 /* even 2. pixel */
58 "mtlo %[vector4a], $ac2 \n\t"
59 "mthi $zero, $ac2 \n\t"
60 "balign %[tp2], %[tp1], 3 \n\t"
61 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
62 "extp %[Temp3], $ac2, 31 \n\t"
63
64 "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
65
66 /* odd 1. pixel */
67 "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
68 "mtlo %[vector4a], $ac3 \n\t"
69 "mthi $zero, $ac3 \n\t"
70 "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
71 "preceu.ph.qbr %[p1], %[tp2] \n\t"
72 "preceu.ph.qbl %[p3], %[tp2] \n\t"
73 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
74 "extp %[Temp2], $ac3, 31 \n\t"
75
76 "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
77
78 /* odd 2. pixel */
79 "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
80 "mtlo %[vector4a], $ac2 \n\t"
81 "mthi $zero, $ac2 \n\t"
82 "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
83 "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
84 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
85 "extp %[Temp4], $ac2, 31 \n\t"
86
87 "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
88 "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
89
90 /* clamp */
91 "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
92 "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
93 "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
94
95 "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
96 "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
97
98 "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
99 "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
100
101 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
102 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
103 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
104 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
105 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
106 : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
107 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
108 );
109
110 /* Next row... */
111 src += src_stride;
112 dst += dst_stride;
113 }
114 }
115
convolve_bi_avg_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)116 static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
117 int32_t src_stride,
118 uint8_t *dst,
119 int32_t dst_stride,
120 const int16_t *filter_x0,
121 int32_t h) {
122 int32_t y;
123 uint8_t *cm = vpx_ff_cropTbl;
124 uint32_t vector4a = 64;
125 int32_t Temp1, Temp2, Temp3;
126 uint32_t tp1, tp2, tp3, tp4;
127 uint32_t p1, p2, p3, p4, n1;
128 uint32_t st0, st1;
129 const int16_t *filter = &filter_x0[3];
130 uint32_t filter45;;
131
132 filter45 = ((const int32_t *)filter)[0];
133
134 for (y = h; y--;) {
135 /* prefetch data to cache memory */
136 prefetch_load(src + src_stride);
137 prefetch_load(src + src_stride + 32);
138 prefetch_store(dst + dst_stride);
139
140 __asm__ __volatile__ (
141 "ulw %[tp1], 0(%[src]) \n\t"
142 "ulw %[tp2], 4(%[src]) \n\t"
143
144 /* even 1. pixel */
145 "mtlo %[vector4a], $ac3 \n\t"
146 "mthi $zero, $ac3 \n\t"
147 "mtlo %[vector4a], $ac2 \n\t"
148 "mthi $zero, $ac2 \n\t"
149 "preceu.ph.qbr %[p1], %[tp1] \n\t"
150 "preceu.ph.qbl %[p2], %[tp1] \n\t"
151 "preceu.ph.qbr %[p3], %[tp2] \n\t"
152 "preceu.ph.qbl %[p4], %[tp2] \n\t"
153 "ulw %[tp3], 8(%[src]) \n\t"
154 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
155 "extp %[Temp1], $ac3, 31 \n\t"
156 "lbu %[Temp2], 0(%[dst]) \n\t"
157 "lbu %[tp4], 2(%[dst]) \n\t"
158
159 /* even 2. pixel */
160 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
161 "extp %[Temp3], $ac2, 31 \n\t"
162
163 /* even 3. pixel */
164 "lbux %[st0], %[Temp1](%[cm]) \n\t"
165 "mtlo %[vector4a], $ac1 \n\t"
166 "mthi $zero, $ac1 \n\t"
167 "lbux %[st1], %[Temp3](%[cm]) \n\t"
168 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
169 "extp %[Temp1], $ac1, 31 \n\t"
170
171 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
172 "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
173 "sb %[Temp2], 0(%[dst]) \n\t"
174 "sb %[tp4], 2(%[dst]) \n\t"
175
176 /* even 4. pixel */
177 "mtlo %[vector4a], $ac2 \n\t"
178 "mthi $zero, $ac2 \n\t"
179 "mtlo %[vector4a], $ac3 \n\t"
180 "mthi $zero, $ac3 \n\t"
181
182 "balign %[tp3], %[tp2], 3 \n\t"
183 "balign %[tp2], %[tp1], 3 \n\t"
184
185 "lbux %[st0], %[Temp1](%[cm]) \n\t"
186 "lbu %[Temp2], 4(%[dst]) \n\t"
187 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
188
189 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
190 "extp %[Temp3], $ac2, 31 \n\t"
191
192 /* odd 1. pixel */
193 "mtlo %[vector4a], $ac1 \n\t"
194 "mthi $zero, $ac1 \n\t"
195 "sb %[Temp2], 4(%[dst]) \n\t"
196 "preceu.ph.qbr %[p1], %[tp2] \n\t"
197 "preceu.ph.qbl %[p2], %[tp2] \n\t"
198 "preceu.ph.qbr %[p3], %[tp3] \n\t"
199 "preceu.ph.qbl %[p4], %[tp3] \n\t"
200 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
201 "extp %[Temp2], $ac3, 31 \n\t"
202
203 "lbu %[tp1], 6(%[dst]) \n\t"
204
205 /* odd 2. pixel */
206 "mtlo %[vector4a], $ac3 \n\t"
207 "mthi $zero, $ac3 \n\t"
208 "mtlo %[vector4a], $ac2 \n\t"
209 "mthi $zero, $ac2 \n\t"
210 "lbux %[st0], %[Temp3](%[cm]) \n\t"
211 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
212 "extp %[Temp3], $ac1, 31 \n\t"
213
214 "lbu %[tp2], 1(%[dst]) \n\t"
215 "lbu %[tp3], 3(%[dst]) \n\t"
216 "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
217
218 /* odd 3. pixel */
219 "lbux %[st1], %[Temp2](%[cm]) \n\t"
220 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
221 "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
222 "extp %[Temp2], $ac3, 31 \n\t"
223
224 "lbu %[tp4], 5(%[dst]) \n\t"
225
226 /* odd 4. pixel */
227 "sb %[tp2], 1(%[dst]) \n\t"
228 "sb %[tp1], 6(%[dst]) \n\t"
229 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
230 "extp %[Temp1], $ac2, 31 \n\t"
231
232 "lbu %[tp1], 7(%[dst]) \n\t"
233
234 /* clamp */
235 "lbux %[p4], %[Temp3](%[cm]) \n\t"
236 "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
237
238 "lbux %[p2], %[Temp2](%[cm]) \n\t"
239 "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
240
241 "lbux %[p1], %[Temp1](%[cm]) \n\t"
242 "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
243
244 /* store bytes */
245 "sb %[tp3], 3(%[dst]) \n\t"
246 "sb %[tp4], 5(%[dst]) \n\t"
247 "sb %[tp1], 7(%[dst]) \n\t"
248
249 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
250 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
251 [st0] "=&r" (st0), [st1] "=&r" (st1),
252 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
253 [n1] "=&r" (n1),
254 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
255 : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
256 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
257 );
258
259 /* Next row... */
260 src += src_stride;
261 dst += dst_stride;
262 }
263 }
264
convolve_bi_avg_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)265 static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
266 int32_t src_stride,
267 uint8_t *dst_ptr,
268 int32_t dst_stride,
269 const int16_t *filter_x0,
270 int32_t h,
271 int32_t count) {
272 int32_t y, c;
273 const uint8_t *src;
274 uint8_t *dst;
275 uint8_t *cm = vpx_ff_cropTbl;
276 uint32_t vector_64 = 64;
277 int32_t Temp1, Temp2, Temp3;
278 uint32_t qload1, qload2, qload3;
279 uint32_t p1, p2, p3, p4, p5;
280 uint32_t st1, st2, st3;
281 const int16_t *filter = &filter_x0[3];
282 uint32_t filter45;;
283
284 filter45 = ((const int32_t *)filter)[0];
285
286 for (y = h; y--;) {
287 src = src_ptr;
288 dst = dst_ptr;
289
290 /* prefetch data to cache memory */
291 prefetch_load(src_ptr + src_stride);
292 prefetch_load(src_ptr + src_stride + 32);
293 prefetch_store(dst_ptr + dst_stride);
294
295 for (c = 0; c < count; c++) {
296 __asm__ __volatile__ (
297 "ulw %[qload1], 0(%[src]) \n\t"
298 "ulw %[qload2], 4(%[src]) \n\t"
299
300 /* even 1. pixel */
301 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
302 "mthi $zero, $ac1 \n\t"
303 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
304 "mthi $zero, $ac2 \n\t"
305 "preceu.ph.qbr %[p1], %[qload1] \n\t"
306 "preceu.ph.qbl %[p2], %[qload1] \n\t"
307 "preceu.ph.qbr %[p3], %[qload2] \n\t"
308 "preceu.ph.qbl %[p4], %[qload2] \n\t"
309 "ulw %[qload3], 8(%[src]) \n\t"
310 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
311 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
312 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
313
314 /* even 2. pixel */
315 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
316 "mthi $zero, $ac3 \n\t"
317 "preceu.ph.qbr %[p1], %[qload3] \n\t"
318 "preceu.ph.qbl %[p5], %[qload3] \n\t"
319 "ulw %[qload1], 12(%[src]) \n\t"
320 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
321 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
322 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
323
324 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
325
326 /* even 3. pixel */
327 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
328 "mthi $zero, $ac1 \n\t"
329 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
330 "preceu.ph.qbr %[p2], %[qload1] \n\t"
331 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
332 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
333 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
334 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
335
336 /* even 4. pixel */
337 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
338 "mthi $zero, $ac2 \n\t"
339 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
340 "preceu.ph.qbl %[p3], %[qload1] \n\t"
341 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
342 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
343 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
344 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
345 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
346 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
347
348 /* even 5. pixel */
349 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
350 "mthi $zero, $ac3 \n\t"
351 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
352 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
353 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
354 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
355 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
356
357 /* even 6. pixel */
358 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
359 "mthi $zero, $ac1 \n\t"
360 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
361 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
362 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
363 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
364 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
365 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
366
367 /* even 7. pixel */
368 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
369 "mthi $zero, $ac2 \n\t"
370 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
371 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
372 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
373 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
374 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
375 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
376
377 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
378
379 /* even 8. pixel */
380 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
381 "mthi $zero, $ac3 \n\t"
382 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
383 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
384 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
385 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
386 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
387
388 /* ODD pixels */
389 "ulw %[qload1], 1(%[src]) \n\t"
390 "ulw %[qload2], 5(%[src]) \n\t"
391
392 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
393
394 /* odd 1. pixel */
395 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
396 "mthi $zero, $ac1 \n\t"
397 "preceu.ph.qbr %[p1], %[qload1] \n\t"
398 "preceu.ph.qbl %[p2], %[qload1] \n\t"
399 "preceu.ph.qbr %[p3], %[qload2] \n\t"
400 "preceu.ph.qbl %[p4], %[qload2] \n\t"
401 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
402 "ulw %[qload3], 9(%[src]) \n\t"
403 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
404 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
405 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
406 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
407
408 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
409
410 /* odd 2. pixel */
411 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
412 "mthi $zero, $ac2 \n\t"
413 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
414 "preceu.ph.qbr %[p1], %[qload3] \n\t"
415 "preceu.ph.qbl %[p5], %[qload3] \n\t"
416 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
417 "ulw %[qload1], 13(%[src]) \n\t"
418 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
419 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
420 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
421 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
422
423 /* odd 3. pixel */
424 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
425 "mthi $zero, $ac3 \n\t"
426 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
427 "preceu.ph.qbr %[p2], %[qload1] \n\t"
428 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
429 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
430 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
431 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
432
433 /* odd 4. pixel */
434 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
435 "mthi $zero, $ac1 \n\t"
436 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
437 "preceu.ph.qbl %[p3], %[qload1] \n\t"
438 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
439 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
440 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
441 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
442 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
443
444 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
445
446 /* odd 5. pixel */
447 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
448 "mthi $zero, $ac2 \n\t"
449 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
450 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
451 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
452 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
453 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
454
455 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
456
457 /* odd 6. pixel */
458 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
459 "mthi $zero, $ac3 \n\t"
460 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
461 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
462 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
463 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
464 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
465
466 /* odd 7. pixel */
467 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
468 "mthi $zero, $ac1 \n\t"
469 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
470 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
471 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
472 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
473 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
474
475 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
476
477 /* odd 8. pixel */
478 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
479 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
480
481 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
482
483 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
484 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
485
486 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
487 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
488
489 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
490 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
491
492 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
493 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
494 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
495
496 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
497 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
498 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
499 [qload3] "=&r" (qload3), [p5] "=&r" (p5),
500 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
501 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
502 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
503 );
504
505 src += 16;
506 dst += 16;
507 }
508
509 /* Next row... */
510 src_ptr += src_stride;
511 dst_ptr += dst_stride;
512 }
513 }
514
convolve_bi_avg_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)515 static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
516 int32_t src_stride,
517 uint8_t *dst_ptr,
518 int32_t dst_stride,
519 const int16_t *filter_x0,
520 int32_t h) {
521 int32_t y, c;
522 const uint8_t *src;
523 uint8_t *dst;
524 uint8_t *cm = vpx_ff_cropTbl;
525 uint32_t vector_64 = 64;
526 int32_t Temp1, Temp2, Temp3;
527 uint32_t qload1, qload2, qload3;
528 uint32_t p1, p2, p3, p4, p5;
529 uint32_t st1, st2, st3;
530 const int16_t *filter = &filter_x0[3];
531 uint32_t filter45;;
532
533 filter45 = ((const int32_t *)filter)[0];
534
535 for (y = h; y--;) {
536 src = src_ptr;
537 dst = dst_ptr;
538
539 /* prefetch data to cache memory */
540 prefetch_load(src_ptr + src_stride);
541 prefetch_load(src_ptr + src_stride + 32);
542 prefetch_load(src_ptr + src_stride + 64);
543 prefetch_store(dst_ptr + dst_stride);
544 prefetch_store(dst_ptr + dst_stride + 32);
545
546 for (c = 0; c < 4; c++) {
547 __asm__ __volatile__ (
548 "ulw %[qload1], 0(%[src]) \n\t"
549 "ulw %[qload2], 4(%[src]) \n\t"
550
551 /* even 1. pixel */
552 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
553 "mthi $zero, $ac1 \n\t"
554 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
555 "mthi $zero, $ac2 \n\t"
556 "preceu.ph.qbr %[p1], %[qload1] \n\t"
557 "preceu.ph.qbl %[p2], %[qload1] \n\t"
558 "preceu.ph.qbr %[p3], %[qload2] \n\t"
559 "preceu.ph.qbl %[p4], %[qload2] \n\t"
560 "ulw %[qload3], 8(%[src]) \n\t"
561 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
562 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
563 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
564
565 /* even 2. pixel */
566 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
567 "mthi $zero, $ac3 \n\t"
568 "preceu.ph.qbr %[p1], %[qload3] \n\t"
569 "preceu.ph.qbl %[p5], %[qload3] \n\t"
570 "ulw %[qload1], 12(%[src]) \n\t"
571 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
572 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
573 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
574
575 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
576
577 /* even 3. pixel */
578 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
579 "mthi $zero, $ac1 \n\t"
580 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
581 "preceu.ph.qbr %[p2], %[qload1] \n\t"
582 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
583 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
584 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
585 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
586
587 /* even 4. pixel */
588 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
589 "mthi $zero, $ac2 \n\t"
590 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
591 "preceu.ph.qbl %[p3], %[qload1] \n\t"
592 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
593 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
594 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
595 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
596 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
597 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
598
599 /* even 5. pixel */
600 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
601 "mthi $zero, $ac3 \n\t"
602 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
603 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
604 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
605 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
606 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
607
608 /* even 6. pixel */
609 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
610 "mthi $zero, $ac1 \n\t"
611 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
612 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
613 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
614 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
615 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
616 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
617
618 /* even 7. pixel */
619 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
620 "mthi $zero, $ac2 \n\t"
621 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
622 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
623 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
624 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
625 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
626 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
627
628 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
629
630 /* even 8. pixel */
631 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
632 "mthi $zero, $ac3 \n\t"
633 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
634 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
635 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
636 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
637 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
638
639 /* ODD pixels */
640 "ulw %[qload1], 1(%[src]) \n\t"
641 "ulw %[qload2], 5(%[src]) \n\t"
642
643 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
644
645 /* odd 1. pixel */
646 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
647 "mthi $zero, $ac1 \n\t"
648 "preceu.ph.qbr %[p1], %[qload1] \n\t"
649 "preceu.ph.qbl %[p2], %[qload1] \n\t"
650 "preceu.ph.qbr %[p3], %[qload2] \n\t"
651 "preceu.ph.qbl %[p4], %[qload2] \n\t"
652 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
653 "ulw %[qload3], 9(%[src]) \n\t"
654 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
655 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
656 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
657 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
658
659 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
660
661 /* odd 2. pixel */
662 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
663 "mthi $zero, $ac2 \n\t"
664 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
665 "preceu.ph.qbr %[p1], %[qload3] \n\t"
666 "preceu.ph.qbl %[p5], %[qload3] \n\t"
667 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
668 "ulw %[qload1], 13(%[src]) \n\t"
669 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
670 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
671 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
672 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
673
674 /* odd 3. pixel */
675 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
676 "mthi $zero, $ac3 \n\t"
677 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
678 "preceu.ph.qbr %[p2], %[qload1] \n\t"
679 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
680 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
681 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
682 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
683
684 /* odd 4. pixel */
685 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
686 "mthi $zero, $ac1 \n\t"
687 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
688 "preceu.ph.qbl %[p3], %[qload1] \n\t"
689 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
690 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
691 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
692 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
693 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
694
695 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
696
697 /* odd 5. pixel */
698 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
699 "mthi $zero, $ac2 \n\t"
700 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
701 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
702 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
703 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
704 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
705
706 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
707
708 /* odd 6. pixel */
709 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
710 "mthi $zero, $ac3 \n\t"
711 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
712 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
713 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
714 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
715 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
716
717 /* odd 7. pixel */
718 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
719 "mthi $zero, $ac1 \n\t"
720 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
721 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
722 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
723 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
724 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
725
726 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
727
728 /* odd 8. pixel */
729 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
730 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
731
732 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
733
734 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
735 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
736
737 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
738 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
739
740 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
741 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
742
743 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
744 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
745 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
746
747 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
748 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
749 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
750 [qload3] "=&r" (qload3), [p5] "=&r" (p5),
751 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
752 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
753 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
754 );
755
756 src += 16;
757 dst += 16;
758 }
759
760 /* Next row... */
761 src_ptr += src_stride;
762 dst_ptr += dst_stride;
763 }
764 }
765
/* Public entry point: horizontal 2-tap convolve with averaging into dst.
 *
 * Programs the DSP control register with the accumulator extract position
 * (38), warms the cache for the first row, and then hands the block to the
 * width-specific DSPr2 kernel.  Widths 16 and 32 share the 16-pixel kernel
 * (one or two groups per row).  Any width other than 4, 8, 16, 32 or 64 is
 * forwarded, with all original arguments, to vpx_convolve8_avg_horiz_c.
 *
 * x_step_q4 is asserted to be 16; filter_y and y_step_q4 are only used when
 * forwarding to the C fallback.
 */
void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);

  /* bit position for extract from acc */
  __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  if (w == 4) {
    convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride,
                                  filter_x, h);
  } else if (w == 8) {
    convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride,
                                  filter_x, h);
  } else if (w == 16 || w == 32) {
    /* w / 16 yields the number of 16-pixel groups per row: 1 or 2 */
    convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                   filter_x, h, w / 16);
  } else if (w == 64) {
    /* a 64-wide row spans more cache lines, so prefetch further ahead */
    prefetch_load(src + 64);
    prefetch_store(dst + 32);

    convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride,
                                   filter_x, h);
  } else {
    /* unsupported width: use the generic C implementation */
    vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}
825 #endif
826