/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_horiz_4_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_x0,
26 int32_t h) {
27 int32_t y;
28 uint8_t *cm = vpx_ff_cropTbl;
29 int32_t vector1b, vector2b, vector3b, vector4b;
30 int32_t Temp1, Temp2, Temp3, Temp4;
31 uint32_t vector4a = 64;
32 uint32_t tp1, tp2;
33 uint32_t p1, p2, p3, p4;
34 uint32_t n1, n2, n3, n4;
35 uint32_t tn1, tn2;
36
37 vector1b = ((const int32_t *)filter_x0)[0];
38 vector2b = ((const int32_t *)filter_x0)[1];
39 vector3b = ((const int32_t *)filter_x0)[2];
40 vector4b = ((const int32_t *)filter_x0)[3];
41
42 for (y = h; y--;) {
43 /* prefetch data to cache memory */
44 prefetch_load(src + src_stride);
45 prefetch_load(src + src_stride + 32);
46 prefetch_store(dst + dst_stride);
47
48 __asm__ __volatile__ (
49 "ulw %[tp1], 0(%[src]) \n\t"
50 "ulw %[tp2], 4(%[src]) \n\t"
51
52 /* even 1. pixel */
53 "mtlo %[vector4a], $ac3 \n\t"
54 "mthi $zero, $ac3 \n\t"
55 "preceu.ph.qbr %[p1], %[tp1] \n\t"
56 "preceu.ph.qbl %[p2], %[tp1] \n\t"
57 "preceu.ph.qbr %[p3], %[tp2] \n\t"
58 "preceu.ph.qbl %[p4], %[tp2] \n\t"
59 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
60 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
61 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
62 "ulw %[tn2], 8(%[src]) \n\t"
63 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
64 "extp %[Temp1], $ac3, 31 \n\t"
65
66 /* even 2. pixel */
67 "mtlo %[vector4a], $ac2 \n\t"
68 "mthi $zero, $ac2 \n\t"
69 "preceu.ph.qbr %[p1], %[tn2] \n\t"
70 "balign %[tn1], %[tn2], 3 \n\t"
71 "balign %[tn2], %[tp2], 3 \n\t"
72 "balign %[tp2], %[tp1], 3 \n\t"
73 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
74 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
75 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
76 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
77 "extp %[Temp3], $ac2, 31 \n\t"
78
79 /* odd 1. pixel */
80 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
81 "mtlo %[vector4a], $ac3 \n\t"
82 "mthi $zero, $ac3 \n\t"
83 "preceu.ph.qbr %[n1], %[tp2] \n\t"
84 "preceu.ph.qbl %[n2], %[tp2] \n\t"
85 "preceu.ph.qbr %[n3], %[tn2] \n\t"
86 "preceu.ph.qbl %[n4], %[tn2] \n\t"
87 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
88 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
89 "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
90 "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
91 "extp %[Temp2], $ac3, 31 \n\t"
92
93 /* odd 2. pixel */
94 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
95 "mtlo %[vector4a], $ac2 \n\t"
96 "mthi $zero, $ac2 \n\t"
97 "preceu.ph.qbr %[n1], %[tn1] \n\t"
98 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
99 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
100 "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
101 "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
102 "extp %[Temp4], $ac2, 31 \n\t"
103
104 /* clamp */
105 "lbux %[tn1], %[Temp2](%[cm]) \n\t"
106 "lbux %[n2], %[Temp4](%[cm]) \n\t"
107
108 /* store bytes */
109 "sb %[tp1], 0(%[dst]) \n\t"
110 "sb %[tn1], 1(%[dst]) \n\t"
111 "sb %[tp2], 2(%[dst]) \n\t"
112 "sb %[n2], 3(%[dst]) \n\t"
113
114 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
115 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
116 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
117 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
118 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
119 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
120 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
121 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
122 [vector4a] "r" (vector4a),
123 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
124 );
125
126 /* Next row... */
127 src += src_stride;
128 dst += dst_stride;
129 }
130 }
131
convolve_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)132 static void convolve_horiz_8_dspr2(const uint8_t *src,
133 int32_t src_stride,
134 uint8_t *dst,
135 int32_t dst_stride,
136 const int16_t *filter_x0,
137 int32_t h) {
138 int32_t y;
139 uint8_t *cm = vpx_ff_cropTbl;
140 uint32_t vector4a = 64;
141 int32_t vector1b, vector2b, vector3b, vector4b;
142 int32_t Temp1, Temp2, Temp3;
143 uint32_t tp1, tp2;
144 uint32_t p1, p2, p3, p4, n1;
145 uint32_t tn1, tn2, tn3;
146 uint32_t st0, st1;
147
148 vector1b = ((const int32_t *)filter_x0)[0];
149 vector2b = ((const int32_t *)filter_x0)[1];
150 vector3b = ((const int32_t *)filter_x0)[2];
151 vector4b = ((const int32_t *)filter_x0)[3];
152
153 for (y = h; y--;) {
154 /* prefetch data to cache memory */
155 prefetch_load(src + src_stride);
156 prefetch_load(src + src_stride + 32);
157 prefetch_store(dst + dst_stride);
158
159 __asm__ __volatile__ (
160 "ulw %[tp1], 0(%[src]) \n\t"
161 "ulw %[tp2], 4(%[src]) \n\t"
162
163 /* even 1. pixel */
164 "mtlo %[vector4a], $ac3 \n\t"
165 "mthi $zero, $ac3 \n\t"
166 "mtlo %[vector4a], $ac2 \n\t"
167 "mthi $zero, $ac2 \n\t"
168 "preceu.ph.qbr %[p1], %[tp1] \n\t"
169 "preceu.ph.qbl %[p2], %[tp1] \n\t"
170 "preceu.ph.qbr %[p3], %[tp2] \n\t"
171 "preceu.ph.qbl %[p4], %[tp2] \n\t"
172 "ulw %[tn2], 8(%[src]) \n\t"
173 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
174 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
175 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
176 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
177 "extp %[Temp1], $ac3, 31 \n\t"
178
179 /* even 2. pixel */
180 "preceu.ph.qbr %[p1], %[tn2] \n\t"
181 "preceu.ph.qbl %[n1], %[tn2] \n\t"
182 "ulw %[tn1], 12(%[src]) \n\t"
183 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
184 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
185 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
186 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
187 "extp %[Temp3], $ac2, 31 \n\t"
188
189 /* even 3. pixel */
190 "lbux %[st0], %[Temp1](%[cm]) \n\t"
191 "mtlo %[vector4a], $ac1 \n\t"
192 "mthi $zero, $ac1 \n\t"
193 "preceu.ph.qbr %[p2], %[tn1] \n\t"
194 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
195 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
196 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
197 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
198 "extp %[Temp1], $ac1, 31 \n\t"
199
200 /* even 4. pixel */
201 "mtlo %[vector4a], $ac2 \n\t"
202 "mthi $zero, $ac2 \n\t"
203 "mtlo %[vector4a], $ac3 \n\t"
204 "mthi $zero, $ac3 \n\t"
205 "sb %[st0], 0(%[dst]) \n\t"
206 "lbux %[st1], %[Temp3](%[cm]) \n\t"
207
208 "balign %[tn3], %[tn1], 3 \n\t"
209 "balign %[tn1], %[tn2], 3 \n\t"
210 "balign %[tn2], %[tp2], 3 \n\t"
211 "balign %[tp2], %[tp1], 3 \n\t"
212
213 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
214 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
215 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
216 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
217 "extp %[Temp3], $ac2, 31 \n\t"
218
219 "lbux %[st0], %[Temp1](%[cm]) \n\t"
220
221 /* odd 1. pixel */
222 "mtlo %[vector4a], $ac1 \n\t"
223 "mthi $zero, $ac1 \n\t"
224 "sb %[st1], 2(%[dst]) \n\t"
225 "preceu.ph.qbr %[p1], %[tp2] \n\t"
226 "preceu.ph.qbl %[p2], %[tp2] \n\t"
227 "preceu.ph.qbr %[p3], %[tn2] \n\t"
228 "preceu.ph.qbl %[p4], %[tn2] \n\t"
229 "sb %[st0], 4(%[dst]) \n\t"
230 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
231 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
232 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
233 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
234 "extp %[Temp2], $ac3, 31 \n\t"
235
236 /* odd 2. pixel */
237 "mtlo %[vector4a], $ac3 \n\t"
238 "mthi $zero, $ac3 \n\t"
239 "mtlo %[vector4a], $ac2 \n\t"
240 "mthi $zero, $ac2 \n\t"
241 "preceu.ph.qbr %[p1], %[tn1] \n\t"
242 "preceu.ph.qbl %[n1], %[tn1] \n\t"
243 "lbux %[st0], %[Temp3](%[cm]) \n\t"
244 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
245 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
246 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
247 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
248 "extp %[Temp3], $ac1, 31 \n\t"
249
250 /* odd 3. pixel */
251 "lbux %[st1], %[Temp2](%[cm]) \n\t"
252 "preceu.ph.qbr %[p2], %[tn3] \n\t"
253 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
254 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
255 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
256 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
257 "extp %[Temp2], $ac3, 31 \n\t"
258
259 /* odd 4. pixel */
260 "sb %[st1], 1(%[dst]) \n\t"
261 "sb %[st0], 6(%[dst]) \n\t"
262 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
263 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
264 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
265 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
266 "extp %[Temp1], $ac2, 31 \n\t"
267
268 /* clamp */
269 "lbux %[p4], %[Temp3](%[cm]) \n\t"
270 "lbux %[p2], %[Temp2](%[cm]) \n\t"
271 "lbux %[n1], %[Temp1](%[cm]) \n\t"
272
273 /* store bytes */
274 "sb %[p4], 3(%[dst]) \n\t"
275 "sb %[p2], 5(%[dst]) \n\t"
276 "sb %[n1], 7(%[dst]) \n\t"
277
278 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
279 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
280 [st0] "=&r" (st0), [st1] "=&r" (st1),
281 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
282 [n1] "=&r" (n1),
283 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
284 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
285 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
286 [vector4a] "r" (vector4a),
287 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
288 );
289
290 /* Next row... */
291 src += src_stride;
292 dst += dst_stride;
293 }
294 }
295
convolve_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)296 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
297 int32_t src_stride,
298 uint8_t *dst_ptr,
299 int32_t dst_stride,
300 const int16_t *filter_x0,
301 int32_t h,
302 int32_t count) {
303 int32_t y, c;
304 const uint8_t *src;
305 uint8_t *dst;
306 uint8_t *cm = vpx_ff_cropTbl;
307 uint32_t vector_64 = 64;
308 int32_t filter12, filter34, filter56, filter78;
309 int32_t Temp1, Temp2, Temp3;
310 uint32_t qload1, qload2, qload3;
311 uint32_t p1, p2, p3, p4, p5;
312 uint32_t st1, st2, st3;
313
314 filter12 = ((const int32_t *)filter_x0)[0];
315 filter34 = ((const int32_t *)filter_x0)[1];
316 filter56 = ((const int32_t *)filter_x0)[2];
317 filter78 = ((const int32_t *)filter_x0)[3];
318
319 for (y = h; y--;) {
320 src = src_ptr;
321 dst = dst_ptr;
322
323 /* prefetch data to cache memory */
324 prefetch_load(src_ptr + src_stride);
325 prefetch_load(src_ptr + src_stride + 32);
326 prefetch_store(dst_ptr + dst_stride);
327
328 for (c = 0; c < count; c++) {
329 __asm__ __volatile__ (
330 "ulw %[qload1], 0(%[src]) \n\t"
331 "ulw %[qload2], 4(%[src]) \n\t"
332
333 /* even 1. pixel */
334 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
335 "mthi $zero, $ac1 \n\t"
336 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
337 "mthi $zero, $ac2 \n\t"
338 "preceu.ph.qbr %[p1], %[qload1] \n\t"
339 "preceu.ph.qbl %[p2], %[qload1] \n\t"
340 "preceu.ph.qbr %[p3], %[qload2] \n\t"
341 "preceu.ph.qbl %[p4], %[qload2] \n\t"
342 "ulw %[qload3], 8(%[src]) \n\t"
343 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
344 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
345 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
346 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
347 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
348
349 /* even 2. pixel */
350 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
351 "mthi $zero, $ac3 \n\t"
352 "preceu.ph.qbr %[p1], %[qload3] \n\t"
353 "preceu.ph.qbl %[p5], %[qload3] \n\t"
354 "ulw %[qload1], 12(%[src]) \n\t"
355 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
356 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
357 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
358 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
359 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
360 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
361
362 /* even 3. pixel */
363 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
364 "mthi $zero, $ac1 \n\t"
365 "preceu.ph.qbr %[p2], %[qload1] \n\t"
366 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
367 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
368 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
369 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
370 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
371 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
372 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
373
374 /* even 4. pixel */
375 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
376 "mthi $zero, $ac2 \n\t"
377 "preceu.ph.qbl %[p3], %[qload1] \n\t"
378 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
379 "ulw %[qload2], 16(%[src]) \n\t"
380 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
381 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
382 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
383 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
384 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
385 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
386
387 /* even 5. pixel */
388 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
389 "mthi $zero, $ac3 \n\t"
390 "preceu.ph.qbr %[p4], %[qload2] \n\t"
391 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
392 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
393 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
394 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
395 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
396 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
397 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
398
399 /* even 6. pixel */
400 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
401 "mthi $zero, $ac1 \n\t"
402 "preceu.ph.qbl %[p1], %[qload2] \n\t"
403 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
404 "ulw %[qload3], 20(%[src]) \n\t"
405 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
406 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
407 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
408 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
409 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
410 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
411
412 /* even 7. pixel */
413 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
414 "mthi $zero, $ac2 \n\t"
415 "preceu.ph.qbr %[p5], %[qload3] \n\t"
416 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
417 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
418 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
419 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
420 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
421 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
422 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
423
424 /* even 8. pixel */
425 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
426 "mthi $zero, $ac3 \n\t"
427 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
428 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
429 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
430 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
431 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
432 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
433 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
434
435 /* ODD pixels */
436 "ulw %[qload1], 1(%[src]) \n\t"
437 "ulw %[qload2], 5(%[src]) \n\t"
438
439 /* odd 1. pixel */
440 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
441 "mthi $zero, $ac1 \n\t"
442 "preceu.ph.qbr %[p1], %[qload1] \n\t"
443 "preceu.ph.qbl %[p2], %[qload1] \n\t"
444 "preceu.ph.qbr %[p3], %[qload2] \n\t"
445 "preceu.ph.qbl %[p4], %[qload2] \n\t"
446 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
447 "ulw %[qload3], 9(%[src]) \n\t"
448 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
449 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
450 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
451 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
452 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
453 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
454
455 /* odd 2. pixel */
456 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
457 "mthi $zero, $ac2 \n\t"
458 "preceu.ph.qbr %[p1], %[qload3] \n\t"
459 "preceu.ph.qbl %[p5], %[qload3] \n\t"
460 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
461 "ulw %[qload1], 13(%[src]) \n\t"
462 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
463 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
464 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
465 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
466 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
467 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
468
469 /* odd 3. pixel */
470 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
471 "mthi $zero, $ac3 \n\t"
472 "preceu.ph.qbr %[p2], %[qload1] \n\t"
473 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
474 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
475 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
476 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
477 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
478 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
479 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
480
481 /* odd 4. pixel */
482 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
483 "mthi $zero, $ac1 \n\t"
484 "preceu.ph.qbl %[p3], %[qload1] \n\t"
485 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
486 "ulw %[qload2], 17(%[src]) \n\t"
487 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
488 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
489 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
490 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
491 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
492 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
493
494 /* odd 5. pixel */
495 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
496 "mthi $zero, $ac2 \n\t"
497 "preceu.ph.qbr %[p4], %[qload2] \n\t"
498 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
499 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
500 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
501 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
502 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
503 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
504 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
505
506 /* odd 6. pixel */
507 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
508 "mthi $zero, $ac3 \n\t"
509 "preceu.ph.qbl %[p1], %[qload2] \n\t"
510 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
511 "ulw %[qload3], 21(%[src]) \n\t"
512 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
513 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
514 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
515 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
516 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
517 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
518
519 /* odd 7. pixel */
520 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
521 "mthi $zero, $ac1 \n\t"
522 "preceu.ph.qbr %[p5], %[qload3] \n\t"
523 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
524 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
525 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
526 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
527 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
528 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
529
530 /* odd 8. pixel */
531 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
532 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
533 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
534 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
535 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
536
537 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
538 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
539 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
540
541 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
542 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
543 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
544
545 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
546 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
547 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
548 [p5] "=&r" (p5),
549 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
550 : [filter12] "r" (filter12), [filter34] "r" (filter34),
551 [filter56] "r" (filter56), [filter78] "r" (filter78),
552 [vector_64] "r" (vector_64),
553 [cm] "r" (cm), [dst] "r" (dst),
554 [src] "r" (src)
555 );
556
557 src += 16;
558 dst += 16;
559 }
560
561 /* Next row... */
562 src_ptr += src_stride;
563 dst_ptr += dst_stride;
564 }
565 }
566
convolve_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)567 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
568 int32_t src_stride,
569 uint8_t *dst_ptr,
570 int32_t dst_stride,
571 const int16_t *filter_x0,
572 int32_t h) {
573 int32_t y, c;
574 const uint8_t *src;
575 uint8_t *dst;
576 uint8_t *cm = vpx_ff_cropTbl;
577 uint32_t vector_64 = 64;
578 int32_t filter12, filter34, filter56, filter78;
579 int32_t Temp1, Temp2, Temp3;
580 uint32_t qload1, qload2, qload3;
581 uint32_t p1, p2, p3, p4, p5;
582 uint32_t st1, st2, st3;
583
584 filter12 = ((const int32_t *)filter_x0)[0];
585 filter34 = ((const int32_t *)filter_x0)[1];
586 filter56 = ((const int32_t *)filter_x0)[2];
587 filter78 = ((const int32_t *)filter_x0)[3];
588
589 for (y = h; y--;) {
590 src = src_ptr;
591 dst = dst_ptr;
592
593 /* prefetch data to cache memory */
594 prefetch_load(src_ptr + src_stride);
595 prefetch_load(src_ptr + src_stride + 32);
596 prefetch_load(src_ptr + src_stride + 64);
597 prefetch_store(dst_ptr + dst_stride);
598 prefetch_store(dst_ptr + dst_stride + 32);
599
600 for (c = 0; c < 4; c++) {
601 __asm__ __volatile__ (
602 "ulw %[qload1], 0(%[src]) \n\t"
603 "ulw %[qload2], 4(%[src]) \n\t"
604
605 /* even 1. pixel */
606 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
607 "mthi $zero, $ac1 \n\t"
608 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
609 "mthi $zero, $ac2 \n\t"
610 "preceu.ph.qbr %[p1], %[qload1] \n\t"
611 "preceu.ph.qbl %[p2], %[qload1] \n\t"
612 "preceu.ph.qbr %[p3], %[qload2] \n\t"
613 "preceu.ph.qbl %[p4], %[qload2] \n\t"
614 "ulw %[qload3], 8(%[src]) \n\t"
615 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
616 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
617 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
618 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
619 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
620
621 /* even 2. pixel */
622 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
623 "mthi $zero, $ac3 \n\t"
624 "preceu.ph.qbr %[p1], %[qload3] \n\t"
625 "preceu.ph.qbl %[p5], %[qload3] \n\t"
626 "ulw %[qload1], 12(%[src]) \n\t"
627 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
628 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
629 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
630 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
631 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
632 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
633
634 /* even 3. pixel */
635 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
636 "mthi $zero, $ac1 \n\t"
637 "preceu.ph.qbr %[p2], %[qload1] \n\t"
638 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
639 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
640 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
641 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
642 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
643 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
644 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
645
646 /* even 4. pixel */
647 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
648 "mthi $zero, $ac2 \n\t"
649 "preceu.ph.qbl %[p3], %[qload1] \n\t"
650 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
651 "ulw %[qload2], 16(%[src]) \n\t"
652 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
653 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
654 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
655 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
656 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
657 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
658
659 /* even 5. pixel */
660 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
661 "mthi $zero, $ac3 \n\t"
662 "preceu.ph.qbr %[p4], %[qload2] \n\t"
663 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
664 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
665 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
666 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
667 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
668 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
669 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
670
671 /* even 6. pixel */
672 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
673 "mthi $zero, $ac1 \n\t"
674 "preceu.ph.qbl %[p1], %[qload2] \n\t"
675 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
676 "ulw %[qload3], 20(%[src]) \n\t"
677 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
678 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
679 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
680 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
681 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
682 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
683
684 /* even 7. pixel */
685 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
686 "mthi $zero, $ac2 \n\t"
687 "preceu.ph.qbr %[p5], %[qload3] \n\t"
688 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
689 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
690 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
691 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
692 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
693 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
694 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
695
696 /* even 8. pixel */
697 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
698 "mthi $zero, $ac3 \n\t"
699 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
700 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
701 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
702 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
703 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
704 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
705 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
706
707 /* ODD pixels */
708 "ulw %[qload1], 1(%[src]) \n\t"
709 "ulw %[qload2], 5(%[src]) \n\t"
710
711 /* odd 1. pixel */
712 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
713 "mthi $zero, $ac1 \n\t"
714 "preceu.ph.qbr %[p1], %[qload1] \n\t"
715 "preceu.ph.qbl %[p2], %[qload1] \n\t"
716 "preceu.ph.qbr %[p3], %[qload2] \n\t"
717 "preceu.ph.qbl %[p4], %[qload2] \n\t"
718 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
719 "ulw %[qload3], 9(%[src]) \n\t"
720 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
721 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
722 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
723 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
724 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
725 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
726
727 /* odd 2. pixel */
728 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
729 "mthi $zero, $ac2 \n\t"
730 "preceu.ph.qbr %[p1], %[qload3] \n\t"
731 "preceu.ph.qbl %[p5], %[qload3] \n\t"
732 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
733 "ulw %[qload1], 13(%[src]) \n\t"
734 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
735 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
736 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
737 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
738 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
739 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
740
741 /* odd 3. pixel */
742 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
743 "mthi $zero, $ac3 \n\t"
744 "preceu.ph.qbr %[p2], %[qload1] \n\t"
745 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
746 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
747 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
748 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
749 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
750 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
751 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
752
753 /* odd 4. pixel */
754 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
755 "mthi $zero, $ac1 \n\t"
756 "preceu.ph.qbl %[p3], %[qload1] \n\t"
757 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
758 "ulw %[qload2], 17(%[src]) \n\t"
759 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
760 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
761 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
762 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
763 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
764 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
765
766 /* odd 5. pixel */
767 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
768 "mthi $zero, $ac2 \n\t"
769 "preceu.ph.qbr %[p4], %[qload2] \n\t"
770 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
771 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
772 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
773 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
774 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
775 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
776 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
777
778 /* odd 6. pixel */
779 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
780 "mthi $zero, $ac3 \n\t"
781 "preceu.ph.qbl %[p1], %[qload2] \n\t"
782 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
783 "ulw %[qload3], 21(%[src]) \n\t"
784 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
785 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
786 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
787 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
788 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
789 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
790
791 /* odd 7. pixel */
792 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
793 "mthi $zero, $ac1 \n\t"
794 "preceu.ph.qbr %[p5], %[qload3] \n\t"
795 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
796 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
797 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
798 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
799 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
800 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
801
802 /* odd 8. pixel */
803 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
804 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
805 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
806 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
807 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
808
809 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
810 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
811 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
812
813 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
814 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
815 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
816
817 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
818 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
819 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
820 [p5] "=&r" (p5),
821 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
822 : [filter12] "r" (filter12), [filter34] "r" (filter34),
823 [filter56] "r" (filter56), [filter78] "r" (filter78),
824 [vector_64] "r" (vector_64),
825 [cm] "r" (cm), [dst] "r" (dst),
826 [src] "r" (src)
827 );
828
829 src += 16;
830 dst += 16;
831 }
832
833 /* Next row... */
834 src_ptr += src_stride;
835 dst_ptr += dst_stride;
836 }
837 }
838
/*
 * DSPr2 entry point for the horizontal 8-tap convolution.
 *
 *   src, src_stride     - input block and its row stride.
 *   dst, dst_stride     - output block and its row stride.
 *   filter_x            - 8 int16_t horizontal taps (also inspected as
 *                         32-bit words, two taps per word).
 *   x_step_q4           - must be 16 (asserted); scaling is not handled here.
 *   filter_y, y_step_q4 - not used by this function itself; only forwarded to
 *                         the routines it delegates to.
 *   w, h                - block width and height in pixels.
 *
 * Dispatch:
 *   - If the first tap pair (word 0 of filter_x) is zero, the work is handed
 *     to vpx_convolve2_horiz_dspr2 with the original arguments.
 *   - Otherwise src is moved back by 3 bytes, the DSPControl extract position
 *     is programmed, and a width-specific kernel is chosen for w = 4, 8, 16,
 *     32 or 64.
 *   - Any other width falls back to vpx_convolve8_horiz_c, which is given the
 *     original (un-shifted) src pointer.
 *
 * NOTE(review): the second assert rejects a filter whose word 1 equals
 * 0x800000; presumably that is the pass-through filter, which callers are
 * expected to route elsewhere -- confirm.
 */
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
    vpx_convolve2_horiz_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
  } else {
    uint32_t pos = 38;

    prefetch_load((const uint8_t *)filter_x);
    /* The kernels below expect src to point 3 bytes before the block. */
    src -= 3;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
      "wrdsp %[pos], 1 \n\t"
      :
      : [pos] "r" (pos)
    );

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);
    prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_horiz_4_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        break;
      case 8:
        convolve_horiz_8_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        break;
      case 16:
        /* one 16-pixel group per row */
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 1);
        break;
      case 32:
        /* two 16-pixel groups per row */
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h);
        break;
      default:
        /* Undo the 3-byte rewind: the C version takes the block pointer. */
        vpx_convolve8_horiz_c(src + 3, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
        break;
    }
  }
}
910 #endif
911