/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
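/* Horizontal 8-tap convolve-and-average for rows 4 pixels wide. The DSP
   accumulators are seeded with the rounding constant 64, the filtered sums
   are clamped through the crop table, and each result is combined with the
   existing dst pixel by a rounded halving add (addqh_r.w). */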
static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])              \n\t"
        "ulw              %[tp2],      4(%[src])              \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                   \n\t"
        "mthi             $zero,       $ac3                   \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                 \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                 \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                 \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                 \n\t"
        "dpa.w.ph         $ac3,        %[p1],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p2],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p3],    %[vector3b]  \n\t"
        "ulw              %[tn2],      8(%[src])              \n\t"
        "dpa.w.ph         $ac3,        %[p4],    %[vector4b]  \n\t"
        "extp             %[Temp1],    $ac3,     31           \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                   \n\t"
        "mthi             $zero,       $ac2                   \n\t"
        "preceu.ph.qbr    %[p1],       %[tn2]                 \n\t"
        "balign           %[tn1],      %[tn2],   3            \n\t"
        "balign           %[tn2],      %[tp2],   3            \n\t"
        "balign           %[tp2],      %[tp1],   3            \n\t"
        "dpa.w.ph         $ac2,        %[p2],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p3],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[p4],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac2,        %[p1],    %[vector4b]  \n\t"
        "extp             %[Temp3],    $ac2,     31           \n\t"

        "lbu              %[p2],       3(%[dst])              \n\t" /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])        \n\t" /* even 1 */
        "mtlo             %[vector4a], $ac3                   \n\t"
        "mthi             $zero,       $ac3                   \n\t"
        "lbu              %[Temp1],    1(%[dst])              \n\t" /* load odd 1 */
        "preceu.ph.qbr    %[n1],       %[tp2]                 \n\t"
        "preceu.ph.qbl    %[n2],       %[tp2]                 \n\t"
        "preceu.ph.qbr    %[n3],       %[tn2]                 \n\t"
        "preceu.ph.qbl    %[n4],       %[tn2]                 \n\t"
        "dpa.w.ph         $ac3,        %[n1],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[n2],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[n3],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac3,        %[n4],    %[vector4b]  \n\t"
        "extp             %[Temp2],    $ac3,     31           \n\t"

        "lbu              %[tn2],      0(%[dst])              \n\t" /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp3](%[cm])        \n\t" /* even 2 */
        "mtlo             %[vector4a], $ac2                   \n\t"
        "mthi             $zero,       $ac2                   \n\t"
        "preceu.ph.qbr    %[n1],       %[tn1]                 \n\t"
        "lbux             %[tn1],      %[Temp2](%[cm])        \n\t" /* odd 1 */
        "addqh_r.w        %[tn2],      %[tn2],   %[tp1]       \n\t" /* average even 1 */
        "dpa.w.ph         $ac2,        %[n2],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[n3],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[n4],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac2,        %[n1],    %[vector4b]  \n\t"
        "extp             %[Temp4],    $ac2,     31           \n\t"

        "lbu              %[tp1],      2(%[dst])              \n\t" /* load even 2 */
        "sb               %[tn2],      0(%[dst])              \n\t" /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],    %[Temp1], %[tn1]       \n\t" /* average odd 1 */
        "lbux             %[n2],       %[Temp4](%[cm])        \n\t" /* odd 2 */
        "sb               %[Temp1],    1(%[dst])              \n\t" /* store odd 1 */

        "addqh_r.w        %[tp1],      %[tp1],   %[tp2]       \n\t" /* average even 2 */
        "sb               %[tp1],      2(%[dst])              \n\t" /* store even 2 */

        "addqh_r.w        %[p2],       %[p2],    %[n2]        \n\t" /* average odd 2 */
        "sb               %[p2],       3(%[dst])              \n\t" /* store odd 2 */

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

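/* Same convolve-and-average scheme for rows 8 pixels wide: even and odd
   outputs are computed on alternating accumulators so that dst loads,
   averages, and stores overlap with the multiply-accumulate work. */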
static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])              \n\t"
        "ulw              %[tp2],      4(%[src])              \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                   \n\t"
        "mthi             $zero,       $ac3                   \n\t"
        "mtlo             %[vector4a], $ac2                   \n\t"
        "mthi             $zero,       $ac2                   \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                 \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                 \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                 \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                 \n\t"
        "ulw              %[tn2],      8(%[src])              \n\t"
        "dpa.w.ph         $ac3,        %[p1],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p2],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p3],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac3,        %[p4],    %[vector4b]  \n\t"
        "extp             %[Temp1],    $ac3,     31           \n\t"
        "lbu              %[Temp2],    0(%[dst])              \n\t"
        "lbu              %[tn3],      2(%[dst])              \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],       %[tn2]                 \n\t"
        "preceu.ph.qbl    %[n1],       %[tn2]                 \n\t"
        "ulw              %[tn1],      12(%[src])             \n\t"
        "dpa.w.ph         $ac2,        %[p2],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p3],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[p4],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac2,        %[p1],    %[vector4b]  \n\t"
        "extp             %[Temp3],    $ac2,     31           \n\t"

        /* even 3. pixel */
        "lbux             %[st0],      %[Temp1](%[cm])        \n\t"
        "mtlo             %[vector4a], $ac1                   \n\t"
        "mthi             $zero,       $ac1                   \n\t"
        "preceu.ph.qbr    %[p2],       %[tn1]                 \n\t"
        "lbux             %[st1],      %[Temp3](%[cm])        \n\t"
        "dpa.w.ph         $ac1,        %[p3],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac1,        %[p4],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac1,        %[p1],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac1,        %[n1],    %[vector4b]  \n\t"
        "extp             %[Temp1],    $ac1,     31           \n\t"

        "addqh_r.w        %[Temp2],    %[Temp2], %[st0]       \n\t"
        "addqh_r.w        %[tn3],      %[tn3],   %[st1]       \n\t"
        "sb               %[Temp2],    0(%[dst])              \n\t"
        "sb               %[tn3],      2(%[dst])              \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                   \n\t"
        "mthi             $zero,       $ac2                   \n\t"
        "mtlo             %[vector4a], $ac3                   \n\t"
        "mthi             $zero,       $ac3                   \n\t"

        "balign           %[tn3],      %[tn1],   3            \n\t"
        "balign           %[tn1],      %[tn2],   3            \n\t"
        "balign           %[tn2],      %[tp2],   3            \n\t"
        "balign           %[tp2],      %[tp1],   3            \n\t"

        "lbux             %[st0],      %[Temp1](%[cm])        \n\t"
        "lbu              %[Temp2],    4(%[dst])              \n\t"
        "addqh_r.w        %[Temp2],    %[Temp2], %[st0]       \n\t"

        "dpa.w.ph         $ac2,        %[p4],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p1],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[n1],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac2,        %[p2],    %[vector4b]  \n\t"
        "extp             %[Temp3],    $ac2,     31           \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                   \n\t"
        "mthi             $zero,       $ac1                   \n\t"
        "sb               %[Temp2],    4(%[dst])              \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                 \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                 \n\t"
        "preceu.ph.qbr    %[p3],       %[tn2]                 \n\t"
        "preceu.ph.qbl    %[p4],       %[tn2]                 \n\t"
        "dpa.w.ph         $ac3,        %[p1],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p2],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p3],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac3,        %[p4],    %[vector4b]  \n\t"
        "extp             %[Temp2],    $ac3,     31           \n\t"

        "lbu              %[tp1],      6(%[dst])              \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac3                   \n\t"
        "mthi             $zero,       $ac3                   \n\t"
        "mtlo             %[vector4a], $ac2                   \n\t"
        "mthi             $zero,       $ac2                   \n\t"
        "preceu.ph.qbr    %[p1],       %[tn1]                 \n\t"
        "preceu.ph.qbl    %[n1],       %[tn1]                 \n\t"
        "lbux             %[st0],      %[Temp3](%[cm])        \n\t"
        "dpa.w.ph         $ac1,        %[p2],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac1,        %[p3],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac1,        %[p4],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac1,        %[p1],    %[vector4b]  \n\t"
        "extp             %[Temp3],    $ac1,     31           \n\t"

        "lbu              %[tp2],      1(%[dst])              \n\t"
        "lbu              %[tn2],      3(%[dst])              \n\t"
        "addqh_r.w        %[tp1],      %[tp1],   %[st0]       \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],      %[Temp2](%[cm])        \n\t"
        "preceu.ph.qbr    %[p2],       %[tn3]                 \n\t"
        "dpa.w.ph         $ac3,        %[p3],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p4],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p1],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac3,        %[n1],    %[vector4b]  \n\t"
        "addqh_r.w        %[tp2],      %[tp2],   %[st1]       \n\t"
        "extp             %[Temp2],    $ac3,     31           \n\t"

        "lbu              %[tn3],      5(%[dst])              \n\t"

        /* odd 4. pixel */
        "sb               %[tp2],      1(%[dst])              \n\t"
        "sb               %[tp1],      6(%[dst])              \n\t"
        "dpa.w.ph         $ac2,        %[p4],    %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p1],    %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[n1],    %[vector3b]  \n\t"
        "dpa.w.ph         $ac2,        %[p2],    %[vector4b]  \n\t"
        "extp             %[Temp1],    $ac2,     31           \n\t"

        "lbu              %[tn1],      7(%[dst])              \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])        \n\t"
        "addqh_r.w        %[tn2],      %[tn2],   %[p4]        \n\t"

        "lbux             %[p2],       %[Temp2](%[cm])        \n\t"
        "addqh_r.w        %[tn3],      %[tn3],   %[p2]        \n\t"

        "lbux             %[n1],       %[Temp1](%[cm])        \n\t"
        "addqh_r.w        %[tn1],      %[tn1],   %[n1]        \n\t"

        /* store bytes */
        "sb               %[tn2],      3(%[dst])              \n\t"
        "sb               %[tn3],      5(%[dst])              \n\t"
        "sb               %[tn1],      7(%[dst])              \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

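/* Wide-row variant: each inner-loop pass produces 16 output pixels, even
   outputs first and then odd outputs from source re-loaded at offset 1;
   count selects one pass for w == 16 or two passes for w == 32. */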
static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride, uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0, int32_t h,
                                        int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])              \n\t"
          "ulw              %[qload2],    4(%[src])              \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* even 1 */
          "mthi             $zero,        $ac1                   \n\t"
          "mtlo             %[vector_64], $ac2                   \n\t" /* even 2 */
          "mthi             $zero,        $ac2                   \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]              \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]              \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]              \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]              \n\t"
          "ulw              %[qload3],    8(%[src])              \n\t"
          "dpa.w.ph         $ac1,         %[p1],    %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p2],    %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter78]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])              \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* even 3 */
          "mthi             $zero,        $ac3                   \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]              \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]              \n\t"
          "ulw              %[qload1],    12(%[src])             \n\t"
          "dpa.w.ph         $ac2,         %[p2],    %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p1],    %[filter78]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* even 1 */

          "lbu              %[qload3],    2(%[dst])              \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* even 4 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[st2],       %[st2],   %[st1]       \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]              \n\t"
          "sb               %[st2],       0(%[dst])              \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],    %[filter12]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter34]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p1],    %[filter56]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p5],    %[filter78]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* even 5 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st2]      \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]              \n\t"
          "sb               %[qload3],    2(%[dst])              \n\t" /* store even 2 to dst */
          "ulw              %[qload2],    16(%[src])             \n\t"
          "lbu              %[qload3],    4(%[dst])              \n\t" /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])              \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],    %[filter12]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter34]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p5],    %[filter56]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p2],    %[filter78]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* even 6 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]      \n\t" /* average even 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]              \n\t"
          "sb               %[qload3],    4(%[dst])              \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],    %[filter12]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p5],    %[filter34]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p2],    %[filter56]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter78]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* even 7 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]      \n\t" /* average even 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]              \n\t"
          "sb               %[qload1],    6(%[dst])              \n\t" /* store even 4 to dst */
          "ulw              %[qload3],    20(%[src])             \n\t"
          "dpa.w.ph         $ac3,         %[p5],    %[filter12]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p2],    %[filter34]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p3],    %[filter56]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter78]  \n\t" /* even 6 */
          "lbu              %[qload2],    8(%[dst])              \n\t" /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* even 8 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]      \n\t" /* average even 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]              \n\t"
          "sb               %[qload2],    8(%[dst])              \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],    %[filter12]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter34]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter56]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter78]  \n\t" /* even 7 */
          "lbu              %[qload3],    10(%[dst])             \n\t" /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* even 6 */

          "lbu              %[st2],       12(%[dst])             \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]      \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter12]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter34]  \n\t" /* even 8 */
          "sb               %[qload3],    10(%[dst])             \n\t" /* store even 6 to dst */
          "dpa.w.ph         $ac2,         %[p1],    %[filter56]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p5],    %[filter78]  \n\t" /* even 8 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])              \n\t"
          "ulw              %[qload2],    5(%[src])              \n\t"

          "addqh_r.w        %[st2],       %[st2],   %[st1]       \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                   \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]              \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]              \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]              \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]              \n\t"
          "sb               %[st2],       12(%[dst])             \n\t" /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])              \n\t"
          "dpa.w.ph         $ac3,         %[p1],    %[filter12]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p2],    %[filter34]  \n\t" /* odd 1 */
          "lbu              %[qload2],    14(%[dst])             \n\t" /* load even 8 from dst */
          "dpa.w.ph         $ac3,         %[p3],    %[filter56]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter78]  \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* even 8 */

          "lbu              %[st1],       1(%[dst])              \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]      \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]              \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]              \n\t"
          "sb               %[qload2],    14(%[dst])             \n\t" /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])             \n\t"
          "dpa.w.ph         $ac1,         %[p2],    %[filter12]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter34]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter56]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter78]  \n\t" /* odd 2 */
          "lbu              %[qload3],    3(%[dst])              \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[st3],       %[st3],   %[st1]       \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]              \n\t"
          "dpa.w.ph         $ac2,         %[p3],    %[filter12]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter34]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p1],    %[filter56]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p5],    %[filter78]  \n\t" /* odd 3 */
          "sb               %[st3],       1(%[dst])              \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st1]      \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]              \n\t"
          "sb               %[qload3],    3(%[dst])              \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])              \n\t" /* load odd 3 from dst */
          "ulw              %[qload2],    17(%[src])             \n\t"
          "dpa.w.ph         $ac3,         %[p4],    %[filter12]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p1],    %[filter34]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p5],    %[filter56]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p2],    %[filter78]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* odd 3 */

          "lbu              %[st1],       7(%[dst])              \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st2]      \n\t" /* average odd 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]              \n\t"
          "sb               %[qload1],    5(%[dst])              \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],    %[filter12]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p5],    %[filter34]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p2],    %[filter56]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter78]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* odd 4 */

          "lbu              %[qload1],    9(%[dst])              \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[st1],       %[st1],   %[st3]       \n\t" /* average odd 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]              \n\t"
          "sb               %[st1],       7(%[dst])              \n\t" /* store odd 4 to dst */
          "ulw              %[qload3],    21(%[src])             \n\t"
          "dpa.w.ph         $ac2,         %[p5],    %[filter12]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p2],    %[filter34]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter56]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter78]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]      \n\t" /* average odd 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]              \n\t"
          "sb               %[qload1],    9(%[dst])              \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])             \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],    %[filter12]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p3],    %[filter34]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter56]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p1],    %[filter78]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* odd 7 */

          "lbu              %[qload3],    13(%[dst])             \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],    %[filter12]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter34]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter56]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p5],    %[filter78]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* odd 8 */

          "lbu              %[qload1],    15(%[dst])             \n\t" /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2], %[st2]      \n\t" /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3], %[st3]      \n\t" /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1], %[st1]      \n\t" /* average odd 8 */

          "sb               %[qload2],    11(%[dst])             \n\t" /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])             \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])             \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

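/* 64-pixel-wide rows: the same 16-pixel inner block as above, repeated four
   times per row, with additional prefetches to cover the longer rows. */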
static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride, uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],    0(%[src])              \n\t"
          "ulw              %[qload2],    4(%[src])              \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* even 1 */
          "mthi             $zero,        $ac1                   \n\t"
          "mtlo             %[vector_64], $ac2                   \n\t" /* even 2 */
          "mthi             $zero,        $ac2                   \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]              \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]              \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]              \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]              \n\t"
          "ulw              %[qload3],    8(%[src])              \n\t"
          "dpa.w.ph         $ac1,         %[p1],    %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p2],    %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter78]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])              \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* even 3 */
          "mthi             $zero,        $ac3                   \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]              \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]              \n\t"
          "ulw              %[qload1],    12(%[src])             \n\t"
          "dpa.w.ph         $ac2,         %[p2],    %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p1],    %[filter78]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* even 1 */

          "lbu              %[qload3],    2(%[dst])              \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* even 4 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[st2],       %[st2],   %[st1]       \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]              \n\t"
          "sb               %[st2],       0(%[dst])              \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],    %[filter12]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter34]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p1],    %[filter56]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p5],    %[filter78]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* even 5 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st2]      \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]              \n\t"
          "sb               %[qload3],    2(%[dst])              \n\t" /* store even 2 to dst */
          "ulw              %[qload2],    16(%[src])             \n\t"
          "lbu              %[qload3],    4(%[dst])              \n\t" /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])              \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],    %[filter12]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter34]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p5],    %[filter56]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p2],    %[filter78]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* even 6 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]      \n\t" /* average even 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]              \n\t"
          "sb               %[qload3],    4(%[dst])              \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],    %[filter12]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p5],    %[filter34]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p2],    %[filter56]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter78]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* even 7 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]      \n\t" /* average even 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]              \n\t"
          "sb               %[qload1],    6(%[dst])              \n\t" /* store even 4 to dst */
          "ulw              %[qload3],    20(%[src])             \n\t"
          "dpa.w.ph         $ac3,         %[p5],    %[filter12]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p2],    %[filter34]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p3],    %[filter56]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter78]  \n\t" /* even 6 */
          "lbu              %[qload2],    8(%[dst])              \n\t" /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* even 8 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]      \n\t" /* average even 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]              \n\t"
          "sb               %[qload2],    8(%[dst])              \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],    %[filter12]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter34]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter56]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter78]  \n\t" /* even 7 */
          "lbu              %[qload3],    10(%[dst])             \n\t" /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* even 6 */

          "lbu              %[st2],       12(%[dst])             \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]      \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter12]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter34]  \n\t" /* even 8 */
          "sb               %[qload3],    10(%[dst])             \n\t" /* store even 6 to dst */
          "dpa.w.ph         $ac2,         %[p1],    %[filter56]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p5],    %[filter78]  \n\t" /* even 8 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])              \n\t"
          "ulw              %[qload2],    5(%[src])              \n\t"

          "addqh_r.w        %[st2],       %[st2],   %[st1]       \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                   \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]              \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]              \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]              \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]              \n\t"
          "sb               %[st2],       12(%[dst])             \n\t" /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])              \n\t"
          "dpa.w.ph         $ac3,         %[p1],    %[filter12]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p2],    %[filter34]  \n\t" /* odd 1 */
          "lbu              %[qload2],    14(%[dst])             \n\t" /* load even 8 from dst */
          "dpa.w.ph         $ac3,         %[p3],    %[filter56]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter78]  \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* even 8 */

          "lbu              %[st1],       1(%[dst])              \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]      \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]              \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]              \n\t"
          "sb               %[qload2],    14(%[dst])             \n\t" /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])             \n\t"
          "dpa.w.ph         $ac1,         %[p2],    %[filter12]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter34]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter56]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter78]  \n\t" /* odd 2 */
          "lbu              %[qload3],    3(%[dst])              \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[st3],       %[st3],   %[st1]       \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]              \n\t"
          "dpa.w.ph         $ac2,         %[p3],    %[filter12]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter34]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p1],    %[filter56]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p5],    %[filter78]  \n\t" /* odd 3 */
          "sb               %[st3],       1(%[dst])              \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st1]      \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]              \n\t"
          "sb               %[qload3],    3(%[dst])              \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])              \n\t" /* load odd 3 from dst */
          "ulw              %[qload2],    17(%[src])             \n\t"
          "dpa.w.ph         $ac3,         %[p4],    %[filter12]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p1],    %[filter34]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p5],    %[filter56]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p2],    %[filter78]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* odd 3 */

          "lbu              %[st1],       7(%[dst])              \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                   \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                   \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st2]      \n\t" /* average odd 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]              \n\t"
          "sb               %[qload1],    5(%[dst])              \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],    %[filter12]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p5],    %[filter34]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p2],    %[filter56]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p3],    %[filter78]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* odd 4 */

          "lbu              %[qload1],    9(%[dst])              \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                   \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                   \n\t"
          "addqh_r.w        %[st1],       %[st1],   %[st3]       \n\t" /* average odd 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]              \n\t"
          "sb               %[st1],       7(%[dst])              \n\t" /* store odd 4 to dst */
          "ulw              %[qload3],    21(%[src])             \n\t"
          "dpa.w.ph         $ac2,         %[p5],    %[filter12]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p2],    %[filter34]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p3],    %[filter56]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p4],    %[filter78]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,     31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                   \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                   \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]      \n\t" /* average odd 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]              \n\t"
          "sb               %[qload1],    9(%[dst])              \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])             \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],    %[filter12]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p3],    %[filter34]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p4],    %[filter56]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p1],    %[filter78]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,     31           \n\t" /* odd 7 */

          "lbu              %[qload3],    13(%[dst])             \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],    %[filter12]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p4],    %[filter34]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p1],    %[filter56]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p5],    %[filter78]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,     31           \n\t" /* odd 8 */

          "lbu              %[qload1],    15(%[dst])             \n\t" /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])        \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2], %[st2]      \n\t" /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])        \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3], %[st3]      \n\t" /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])        \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1], %[st1]      \n\t" /* average odd 8 */

          "sb               %[qload2],    11(%[dst])             \n\t" /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])             \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])             \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

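/* Dispatcher: verifies the filter step, routes 2-tap filters to the bilinear
   average path, programs the DSPControl extract position, and selects the
   width-specialized kernel, falling back to the C version for unsupported
   widths. */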
void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (vpx_get_filter_taps(filter_x) == 2) {
    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    src -= 3;

    /* bit position for extract from acc: pos = 38 makes each 32-bit extp in
       the kernels above return the accumulator shifted right by 7, the
       filter precision */
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
                         : [pos] "r"(pos));

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);
    prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                   h);
        break;
      case 8:
        convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                   h);
        break;
      case 16:
        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h, 1);
        break;
      case 32:
        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}
#endif