1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
22 uint8_t *dst, int32_t dst_stride,
23 const int16_t *filter_x0, int32_t h) {
24 int32_t y;
25 uint8_t *cm = vpx_ff_cropTbl;
26 int32_t vector1b, vector2b, vector3b, vector4b;
27 int32_t Temp1, Temp2, Temp3, Temp4;
28 uint32_t vector4a = 64;
29 uint32_t tp1, tp2;
30 uint32_t p1, p2, p3, p4;
31 uint32_t n1, n2, n3, n4;
32 uint32_t tn1, tn2;
33
34 vector1b = ((const int32_t *)filter_x0)[0];
35 vector2b = ((const int32_t *)filter_x0)[1];
36 vector3b = ((const int32_t *)filter_x0)[2];
37 vector4b = ((const int32_t *)filter_x0)[3];
38
39 for (y = h; y--;) {
40 /* prefetch data to cache memory */
41 prefetch_load(src + src_stride);
42 prefetch_load(src + src_stride + 32);
43 prefetch_store(dst + dst_stride);
44
45 __asm__ __volatile__(
46 "ulw %[tp1], 0(%[src]) \n\t"
47 "ulw %[tp2], 4(%[src]) \n\t"
48
49 /* even 1. pixel */
50 "mtlo %[vector4a], $ac3 \n\t"
51 "mthi $zero, $ac3 \n\t"
52 "preceu.ph.qbr %[p1], %[tp1] \n\t"
53 "preceu.ph.qbl %[p2], %[tp1] \n\t"
54 "preceu.ph.qbr %[p3], %[tp2] \n\t"
55 "preceu.ph.qbl %[p4], %[tp2] \n\t"
56 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
57 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
58 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
59 "ulw %[tn2], 8(%[src]) \n\t"
60 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
61 "extp %[Temp1], $ac3, 31 \n\t"
62
63 /* even 2. pixel */
64 "mtlo %[vector4a], $ac2 \n\t"
65 "mthi $zero, $ac2 \n\t"
66 "preceu.ph.qbr %[p1], %[tn2] \n\t"
67 "balign %[tn1], %[tn2], 3 \n\t"
68 "balign %[tn2], %[tp2], 3 \n\t"
69 "balign %[tp2], %[tp1], 3 \n\t"
70 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
71 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
72 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
73 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
74 "extp %[Temp3], $ac2, 31 \n\t"
75
76 /* odd 1. pixel */
77 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
78 "mtlo %[vector4a], $ac3 \n\t"
79 "mthi $zero, $ac3 \n\t"
80 "preceu.ph.qbr %[n1], %[tp2] \n\t"
81 "preceu.ph.qbl %[n2], %[tp2] \n\t"
82 "preceu.ph.qbr %[n3], %[tn2] \n\t"
83 "preceu.ph.qbl %[n4], %[tn2] \n\t"
84 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
85 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
86 "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
87 "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
88 "extp %[Temp2], $ac3, 31 \n\t"
89
90 /* odd 2. pixel */
91 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
92 "mtlo %[vector4a], $ac2 \n\t"
93 "mthi $zero, $ac2 \n\t"
94 "preceu.ph.qbr %[n1], %[tn1] \n\t"
95 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
96 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
97 "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
98 "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
99 "extp %[Temp4], $ac2, 31 \n\t"
100
101 /* clamp */
102 "lbux %[tn1], %[Temp2](%[cm]) \n\t"
103 "lbux %[n2], %[Temp4](%[cm]) \n\t"
104
105 /* store bytes */
106 "sb %[tp1], 0(%[dst]) \n\t"
107 "sb %[tn1], 1(%[dst]) \n\t"
108 "sb %[tp2], 2(%[dst]) \n\t"
109 "sb %[n2], 3(%[dst]) \n\t"
110
111 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
112 [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
113 [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
114 [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
115 [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
116 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
117 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
118 [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
119 [src] "r"(src));
120
121 /* Next row... */
122 src += src_stride;
123 dst += dst_stride;
124 }
125 }
126
convolve_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)127 static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
128 uint8_t *dst, int32_t dst_stride,
129 const int16_t *filter_x0, int32_t h) {
130 int32_t y;
131 uint8_t *cm = vpx_ff_cropTbl;
132 uint32_t vector4a = 64;
133 int32_t vector1b, vector2b, vector3b, vector4b;
134 int32_t Temp1, Temp2, Temp3;
135 uint32_t tp1, tp2;
136 uint32_t p1, p2, p3, p4, n1;
137 uint32_t tn1, tn2, tn3;
138 uint32_t st0, st1;
139
140 vector1b = ((const int32_t *)filter_x0)[0];
141 vector2b = ((const int32_t *)filter_x0)[1];
142 vector3b = ((const int32_t *)filter_x0)[2];
143 vector4b = ((const int32_t *)filter_x0)[3];
144
145 for (y = h; y--;) {
146 /* prefetch data to cache memory */
147 prefetch_load(src + src_stride);
148 prefetch_load(src + src_stride + 32);
149 prefetch_store(dst + dst_stride);
150
151 __asm__ __volatile__(
152 "ulw %[tp1], 0(%[src]) \n\t"
153 "ulw %[tp2], 4(%[src]) \n\t"
154
155 /* even 1. pixel */
156 "mtlo %[vector4a], $ac3 \n\t"
157 "mthi $zero, $ac3 \n\t"
158 "mtlo %[vector4a], $ac2 \n\t"
159 "mthi $zero, $ac2 \n\t"
160 "preceu.ph.qbr %[p1], %[tp1] \n\t"
161 "preceu.ph.qbl %[p2], %[tp1] \n\t"
162 "preceu.ph.qbr %[p3], %[tp2] \n\t"
163 "preceu.ph.qbl %[p4], %[tp2] \n\t"
164 "ulw %[tn2], 8(%[src]) \n\t"
165 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
166 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
167 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
168 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
169 "extp %[Temp1], $ac3, 31 \n\t"
170
171 /* even 2. pixel */
172 "preceu.ph.qbr %[p1], %[tn2] \n\t"
173 "preceu.ph.qbl %[n1], %[tn2] \n\t"
174 "ulw %[tn1], 12(%[src]) \n\t"
175 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
176 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
177 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
178 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
179 "extp %[Temp3], $ac2, 31 \n\t"
180
181 /* even 3. pixel */
182 "lbux %[st0], %[Temp1](%[cm]) \n\t"
183 "mtlo %[vector4a], $ac1 \n\t"
184 "mthi $zero, $ac1 \n\t"
185 "preceu.ph.qbr %[p2], %[tn1] \n\t"
186 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
187 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
188 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
189 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
190 "extp %[Temp1], $ac1, 31 \n\t"
191
192 /* even 4. pixel */
193 "mtlo %[vector4a], $ac2 \n\t"
194 "mthi $zero, $ac2 \n\t"
195 "mtlo %[vector4a], $ac3 \n\t"
196 "mthi $zero, $ac3 \n\t"
197 "sb %[st0], 0(%[dst]) \n\t"
198 "lbux %[st1], %[Temp3](%[cm]) \n\t"
199
200 "balign %[tn3], %[tn1], 3 \n\t"
201 "balign %[tn1], %[tn2], 3 \n\t"
202 "balign %[tn2], %[tp2], 3 \n\t"
203 "balign %[tp2], %[tp1], 3 \n\t"
204
205 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
206 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
207 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
208 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
209 "extp %[Temp3], $ac2, 31 \n\t"
210
211 "lbux %[st0], %[Temp1](%[cm]) \n\t"
212
213 /* odd 1. pixel */
214 "mtlo %[vector4a], $ac1 \n\t"
215 "mthi $zero, $ac1 \n\t"
216 "sb %[st1], 2(%[dst]) \n\t"
217 "preceu.ph.qbr %[p1], %[tp2] \n\t"
218 "preceu.ph.qbl %[p2], %[tp2] \n\t"
219 "preceu.ph.qbr %[p3], %[tn2] \n\t"
220 "preceu.ph.qbl %[p4], %[tn2] \n\t"
221 "sb %[st0], 4(%[dst]) \n\t"
222 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
223 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
224 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
225 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
226 "extp %[Temp2], $ac3, 31 \n\t"
227
228 /* odd 2. pixel */
229 "mtlo %[vector4a], $ac3 \n\t"
230 "mthi $zero, $ac3 \n\t"
231 "mtlo %[vector4a], $ac2 \n\t"
232 "mthi $zero, $ac2 \n\t"
233 "preceu.ph.qbr %[p1], %[tn1] \n\t"
234 "preceu.ph.qbl %[n1], %[tn1] \n\t"
235 "lbux %[st0], %[Temp3](%[cm]) \n\t"
236 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
237 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
238 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
239 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
240 "extp %[Temp3], $ac1, 31 \n\t"
241
242 /* odd 3. pixel */
243 "lbux %[st1], %[Temp2](%[cm]) \n\t"
244 "preceu.ph.qbr %[p2], %[tn3] \n\t"
245 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
246 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
247 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
248 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
249 "extp %[Temp2], $ac3, 31 \n\t"
250
251 /* odd 4. pixel */
252 "sb %[st1], 1(%[dst]) \n\t"
253 "sb %[st0], 6(%[dst]) \n\t"
254 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
255 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
256 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
257 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
258 "extp %[Temp1], $ac2, 31 \n\t"
259
260 /* clamp */
261 "lbux %[p4], %[Temp3](%[cm]) \n\t"
262 "lbux %[p2], %[Temp2](%[cm]) \n\t"
263 "lbux %[n1], %[Temp1](%[cm]) \n\t"
264
265 /* store bytes */
266 "sb %[p4], 3(%[dst]) \n\t"
267 "sb %[p2], 5(%[dst]) \n\t"
268 "sb %[n1], 7(%[dst]) \n\t"
269
270 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
271 [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
272 [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
273 [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
274 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
275 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
276 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
277 [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
278 [src] "r"(src));
279
280 /* Next row... */
281 src += src_stride;
282 dst += dst_stride;
283 }
284 }
285
convolve_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)286 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
287 uint8_t *dst_ptr, int32_t dst_stride,
288 const int16_t *filter_x0, int32_t h,
289 int32_t count) {
290 int32_t y, c;
291 const uint8_t *src;
292 uint8_t *dst;
293 uint8_t *cm = vpx_ff_cropTbl;
294 uint32_t vector_64 = 64;
295 int32_t filter12, filter34, filter56, filter78;
296 int32_t Temp1, Temp2, Temp3;
297 uint32_t qload1, qload2, qload3;
298 uint32_t p1, p2, p3, p4, p5;
299 uint32_t st1, st2, st3;
300
301 filter12 = ((const int32_t *)filter_x0)[0];
302 filter34 = ((const int32_t *)filter_x0)[1];
303 filter56 = ((const int32_t *)filter_x0)[2];
304 filter78 = ((const int32_t *)filter_x0)[3];
305
306 for (y = h; y--;) {
307 src = src_ptr;
308 dst = dst_ptr;
309
310 /* prefetch data to cache memory */
311 prefetch_load(src_ptr + src_stride);
312 prefetch_load(src_ptr + src_stride + 32);
313 prefetch_store(dst_ptr + dst_stride);
314
315 for (c = 0; c < count; c++) {
316 __asm__ __volatile__(
317 "ulw %[qload1], 0(%[src]) \n\t"
318 "ulw %[qload2], 4(%[src]) \n\t"
319
320 /* even 1. pixel */
321 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
322 "mthi $zero, $ac1 \n\t"
323 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
324 "mthi $zero, $ac2 \n\t"
325 "preceu.ph.qbr %[p1], %[qload1] \n\t"
326 "preceu.ph.qbl %[p2], %[qload1] \n\t"
327 "preceu.ph.qbr %[p3], %[qload2] \n\t"
328 "preceu.ph.qbl %[p4], %[qload2] \n\t"
329 "ulw %[qload3], 8(%[src]) \n\t"
330 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
331 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
332 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
333 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
334 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
335
336 /* even 2. pixel */
337 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
338 "mthi $zero, $ac3 \n\t"
339 "preceu.ph.qbr %[p1], %[qload3] \n\t"
340 "preceu.ph.qbl %[p5], %[qload3] \n\t"
341 "ulw %[qload1], 12(%[src]) \n\t"
342 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
343 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
344 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
345 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
346 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
347 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
348
349 /* even 3. pixel */
350 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
351 "mthi $zero, $ac1 \n\t"
352 "preceu.ph.qbr %[p2], %[qload1] \n\t"
353 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
354 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
355 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
356 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
357 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
358 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
359 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
360
361 /* even 4. pixel */
362 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
363 "mthi $zero, $ac2 \n\t"
364 "preceu.ph.qbl %[p3], %[qload1] \n\t"
365 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
366 "ulw %[qload2], 16(%[src]) \n\t"
367 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
368 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
369 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
370 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
371 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
372 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
373
374 /* even 5. pixel */
375 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
376 "mthi $zero, $ac3 \n\t"
377 "preceu.ph.qbr %[p4], %[qload2] \n\t"
378 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
379 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
380 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
381 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
382 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
383 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
384 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
385
386 /* even 6. pixel */
387 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
388 "mthi $zero, $ac1 \n\t"
389 "preceu.ph.qbl %[p1], %[qload2] \n\t"
390 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
391 "ulw %[qload3], 20(%[src]) \n\t"
392 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
393 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
394 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
395 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
396 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
397 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
398
399 /* even 7. pixel */
400 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
401 "mthi $zero, $ac2 \n\t"
402 "preceu.ph.qbr %[p5], %[qload3] \n\t"
403 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
404 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
405 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
406 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
407 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
408 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
409 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
410
411 /* even 8. pixel */
412 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
413 "mthi $zero, $ac3 \n\t"
414 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
415 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
416 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
417 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
418 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
419 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
420 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
421
422 /* ODD pixels */
423 "ulw %[qload1], 1(%[src]) \n\t"
424 "ulw %[qload2], 5(%[src]) \n\t"
425
426 /* odd 1. pixel */
427 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
428 "mthi $zero, $ac1 \n\t"
429 "preceu.ph.qbr %[p1], %[qload1] \n\t"
430 "preceu.ph.qbl %[p2], %[qload1] \n\t"
431 "preceu.ph.qbr %[p3], %[qload2] \n\t"
432 "preceu.ph.qbl %[p4], %[qload2] \n\t"
433 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
434 "ulw %[qload3], 9(%[src]) \n\t"
435 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
436 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
437 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
438 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
439 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
440 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
441
442 /* odd 2. pixel */
443 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
444 "mthi $zero, $ac2 \n\t"
445 "preceu.ph.qbr %[p1], %[qload3] \n\t"
446 "preceu.ph.qbl %[p5], %[qload3] \n\t"
447 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
448 "ulw %[qload1], 13(%[src]) \n\t"
449 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
450 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
451 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
452 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
453 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
454 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
455
456 /* odd 3. pixel */
457 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
458 "mthi $zero, $ac3 \n\t"
459 "preceu.ph.qbr %[p2], %[qload1] \n\t"
460 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
461 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
462 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
463 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
464 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
465 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
466 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
467
468 /* odd 4. pixel */
469 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
470 "mthi $zero, $ac1 \n\t"
471 "preceu.ph.qbl %[p3], %[qload1] \n\t"
472 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
473 "ulw %[qload2], 17(%[src]) \n\t"
474 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
475 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
476 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
477 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
478 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
479 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
480
481 /* odd 5. pixel */
482 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
483 "mthi $zero, $ac2 \n\t"
484 "preceu.ph.qbr %[p4], %[qload2] \n\t"
485 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
486 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
487 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
488 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
489 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
490 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
491 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
492
493 /* odd 6. pixel */
494 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
495 "mthi $zero, $ac3 \n\t"
496 "preceu.ph.qbl %[p1], %[qload2] \n\t"
497 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
498 "ulw %[qload3], 21(%[src]) \n\t"
499 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
500 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
501 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
502 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
503 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
504 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
505
506 /* odd 7. pixel */
507 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
508 "mthi $zero, $ac1 \n\t"
509 "preceu.ph.qbr %[p5], %[qload3] \n\t"
510 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
511 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
512 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
513 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
514 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
515 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
516
517 /* odd 8. pixel */
518 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
519 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
520 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
521 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
522 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
523
524 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
525 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
526 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
527
528 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
529 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
530 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
531
532 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
533 [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
534 [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
535 [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
536 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
537 : [filter12] "r"(filter12), [filter34] "r"(filter34),
538 [filter56] "r"(filter56), [filter78] "r"(filter78),
539 [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
540 [src] "r"(src));
541
542 src += 16;
543 dst += 16;
544 }
545
546 /* Next row... */
547 src_ptr += src_stride;
548 dst_ptr += dst_stride;
549 }
550 }
551
convolve_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)552 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
553 uint8_t *dst_ptr, int32_t dst_stride,
554 const int16_t *filter_x0, int32_t h) {
555 int32_t y, c;
556 const uint8_t *src;
557 uint8_t *dst;
558 uint8_t *cm = vpx_ff_cropTbl;
559 uint32_t vector_64 = 64;
560 int32_t filter12, filter34, filter56, filter78;
561 int32_t Temp1, Temp2, Temp3;
562 uint32_t qload1, qload2, qload3;
563 uint32_t p1, p2, p3, p4, p5;
564 uint32_t st1, st2, st3;
565
566 filter12 = ((const int32_t *)filter_x0)[0];
567 filter34 = ((const int32_t *)filter_x0)[1];
568 filter56 = ((const int32_t *)filter_x0)[2];
569 filter78 = ((const int32_t *)filter_x0)[3];
570
571 for (y = h; y--;) {
572 src = src_ptr;
573 dst = dst_ptr;
574
575 /* prefetch data to cache memory */
576 prefetch_load(src_ptr + src_stride);
577 prefetch_load(src_ptr + src_stride + 32);
578 prefetch_load(src_ptr + src_stride + 64);
579 prefetch_store(dst_ptr + dst_stride);
580 prefetch_store(dst_ptr + dst_stride + 32);
581
582 for (c = 0; c < 4; c++) {
583 __asm__ __volatile__(
584 "ulw %[qload1], 0(%[src]) \n\t"
585 "ulw %[qload2], 4(%[src]) \n\t"
586
587 /* even 1. pixel */
588 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
589 "mthi $zero, $ac1 \n\t"
590 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
591 "mthi $zero, $ac2 \n\t"
592 "preceu.ph.qbr %[p1], %[qload1] \n\t"
593 "preceu.ph.qbl %[p2], %[qload1] \n\t"
594 "preceu.ph.qbr %[p3], %[qload2] \n\t"
595 "preceu.ph.qbl %[p4], %[qload2] \n\t"
596 "ulw %[qload3], 8(%[src]) \n\t"
597 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
598 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
599 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
600 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
601 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
602
603 /* even 2. pixel */
604 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
605 "mthi $zero, $ac3 \n\t"
606 "preceu.ph.qbr %[p1], %[qload3] \n\t"
607 "preceu.ph.qbl %[p5], %[qload3] \n\t"
608 "ulw %[qload1], 12(%[src]) \n\t"
609 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
610 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
611 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
612 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
613 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
614 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
615
616 /* even 3. pixel */
617 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
618 "mthi $zero, $ac1 \n\t"
619 "preceu.ph.qbr %[p2], %[qload1] \n\t"
620 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
621 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
622 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
623 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
624 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
625 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
626 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
627
628 /* even 4. pixel */
629 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
630 "mthi $zero, $ac2 \n\t"
631 "preceu.ph.qbl %[p3], %[qload1] \n\t"
632 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
633 "ulw %[qload2], 16(%[src]) \n\t"
634 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
635 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
636 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
637 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
638 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
639 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
640
641 /* even 5. pixel */
642 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
643 "mthi $zero, $ac3 \n\t"
644 "preceu.ph.qbr %[p4], %[qload2] \n\t"
645 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
646 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
647 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
648 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
649 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
650 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
651 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
652
653 /* even 6. pixel */
654 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
655 "mthi $zero, $ac1 \n\t"
656 "preceu.ph.qbl %[p1], %[qload2] \n\t"
657 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
658 "ulw %[qload3], 20(%[src]) \n\t"
659 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
660 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
661 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
662 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
663 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
664 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
665
666 /* even 7. pixel */
667 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
668 "mthi $zero, $ac2 \n\t"
669 "preceu.ph.qbr %[p5], %[qload3] \n\t"
670 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
671 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
672 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
673 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
674 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
675 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
676 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
677
678 /* even 8. pixel */
679 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
680 "mthi $zero, $ac3 \n\t"
681 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
682 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
683 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
684 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
685 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
686 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
687 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
688
689 /* ODD pixels */
690 "ulw %[qload1], 1(%[src]) \n\t"
691 "ulw %[qload2], 5(%[src]) \n\t"
692
693 /* odd 1. pixel */
694 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
695 "mthi $zero, $ac1 \n\t"
696 "preceu.ph.qbr %[p1], %[qload1] \n\t"
697 "preceu.ph.qbl %[p2], %[qload1] \n\t"
698 "preceu.ph.qbr %[p3], %[qload2] \n\t"
699 "preceu.ph.qbl %[p4], %[qload2] \n\t"
700 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
701 "ulw %[qload3], 9(%[src]) \n\t"
702 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
703 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
704 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
705 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
706 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
707 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
708
709 /* odd 2. pixel */
710 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
711 "mthi $zero, $ac2 \n\t"
712 "preceu.ph.qbr %[p1], %[qload3] \n\t"
713 "preceu.ph.qbl %[p5], %[qload3] \n\t"
714 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
715 "ulw %[qload1], 13(%[src]) \n\t"
716 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
717 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
718 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
719 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
720 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
721 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
722
723 /* odd 3. pixel */
724 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
725 "mthi $zero, $ac3 \n\t"
726 "preceu.ph.qbr %[p2], %[qload1] \n\t"
727 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
728 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
729 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
730 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
731 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
732 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
733 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
734
735 /* odd 4. pixel */
736 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
737 "mthi $zero, $ac1 \n\t"
738 "preceu.ph.qbl %[p3], %[qload1] \n\t"
739 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
740 "ulw %[qload2], 17(%[src]) \n\t"
741 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
742 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
743 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
744 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
745 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
746 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
747
748 /* odd 5. pixel */
749 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
750 "mthi $zero, $ac2 \n\t"
751 "preceu.ph.qbr %[p4], %[qload2] \n\t"
752 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
753 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
754 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
755 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
756 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
757 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
758 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
759
760 /* odd 6. pixel */
761 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
762 "mthi $zero, $ac3 \n\t"
763 "preceu.ph.qbl %[p1], %[qload2] \n\t"
764 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
765 "ulw %[qload3], 21(%[src]) \n\t"
766 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
767 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
768 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
769 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
770 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
771 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
772
773 /* odd 7. pixel */
774 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
775 "mthi $zero, $ac1 \n\t"
776 "preceu.ph.qbr %[p5], %[qload3] \n\t"
777 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
778 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
779 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
780 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
781 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
782 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
783
784 /* odd 8. pixel */
785 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
786 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
787 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
788 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
789 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
790
791 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
792 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
793 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
794
795 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
796 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
797 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
798
799 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
800 [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
801 [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
802 [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
803 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
804 : [filter12] "r"(filter12), [filter34] "r"(filter34),
805 [filter56] "r"(filter56), [filter78] "r"(filter78),
806 [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
807 [src] "r"(src));
808
809 src += 16;
810 dst += 16;
811 }
812
813 /* Next row... */
814 src_ptr += src_stride;
815 dst_ptr += dst_stride;
816 }
817 }
818
/*
 * Horizontal 8-tap convolution entry point for MIPS DSPr2.
 *
 * Selects the kernel filter[x0_q4] and dispatches on the block width:
 *   - a kernel reported as 2-tap by vpx_get_filter_taps() is forwarded
 *     unchanged to vpx_convolve2_horiz_dspr2();
 *   - widths 4, 8, 16, 32 and 64 go to the DSPr2 kernels in this file;
 *   - any other width falls back to vpx_convolve8_horiz_c().
 *
 * x_step_q4 is asserted to be 16. The second 32-bit word of the kernel is
 * asserted not to equal 0x800000.
 */
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int32_t src_pitch = (int32_t)src_stride;
  const int32_t dst_pitch = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;
  /* Value written to the DSPControl pos field; consumed by the extp
   * instructions inside the DSPr2 kernels. */
  const uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* 2-tap kernels are handled by the dedicated convolve2 routine. */
  if (vpx_get_filter_taps(filter_x) == 2) {
    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
    return;
  }

  prefetch_load((const uint8_t *)filter_x);

  /* The DSPr2 kernels start reading 3 bytes before the output position. */
  src -= 3;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(extract_pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_horiz_4_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
      break;
    case 8:
      convolve_horiz_8_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
      break;
    case 16:
    case 32:
      /* One group of 16 pixels for w == 16, two groups for w == 32. */
      convolve_horiz_16_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows,
                              (int32_t)(w >> 4));
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_horiz_64_dspr2(src, src_pitch, dst, dst_pitch, filter_x, rows);
      break;
    default:
      /* Undo the -3 adjustment: the C fallback expects the original src. */
      vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
                            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
}
878 #endif
879