1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_convolve.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_ports/mem.h"
19
20 #if HAVE_DSPR2
convolve_bi_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
22 int32_t src_stride,
23 uint8_t *dst,
24 int32_t dst_stride,
25 const int16_t *filter_x0,
26 int32_t h) {
27 int32_t y;
28 uint8_t *cm = vpx_ff_cropTbl;
29 int32_t Temp1, Temp2, Temp3, Temp4;
30 uint32_t vector4a = 64;
31 uint32_t tp1, tp2;
32 uint32_t p1, p2;
33 const int16_t *filter = &filter_x0[3];
34 uint32_t filter45;;
35
36 filter45 = ((const int32_t *)filter)[0];
37
38 for (y = h; y--;) {
39 /* prefetch data to cache memory */
40 prefetch_load(src + src_stride);
41 prefetch_load(src + src_stride + 32);
42 prefetch_store(dst + dst_stride);
43
44 __asm__ __volatile__ (
45 "ulw %[tp1], 0(%[src]) \n\t"
46 "ulw %[tp2], 4(%[src]) \n\t"
47
48 /* even 1. pixel */
49 "mtlo %[vector4a], $ac3 \n\t"
50 "mthi $zero, $ac3 \n\t"
51 "preceu.ph.qbr %[p1], %[tp1] \n\t"
52 "preceu.ph.qbl %[p2], %[tp1] \n\t"
53 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
54 "extp %[Temp1], $ac3, 31 \n\t"
55
56 /* even 2. pixel */
57 "mtlo %[vector4a], $ac2 \n\t"
58 "mthi $zero, $ac2 \n\t"
59 "balign %[tp2], %[tp1], 3 \n\t"
60 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
61 "extp %[Temp3], $ac2, 31 \n\t"
62
63 /* odd 1. pixel */
64 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
65 "mtlo %[vector4a], $ac3 \n\t"
66 "mthi $zero, $ac3 \n\t"
67 "preceu.ph.qbr %[p1], %[tp2] \n\t"
68 "preceu.ph.qbl %[p2], %[tp2] \n\t"
69 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
70 "extp %[Temp2], $ac3, 31 \n\t"
71
72 /* odd 2. pixel */
73 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
74 "mtlo %[vector4a], $ac2 \n\t"
75 "mthi $zero, $ac2 \n\t"
76 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
77 "extp %[Temp4], $ac2, 31 \n\t"
78
79 /* clamp */
80 "lbux %[p1], %[Temp2](%[cm]) \n\t"
81 "lbux %[p2], %[Temp4](%[cm]) \n\t"
82
83 /* store bytes */
84 "sb %[tp1], 0(%[dst]) \n\t"
85 "sb %[p1], 1(%[dst]) \n\t"
86 "sb %[tp2], 2(%[dst]) \n\t"
87 "sb %[p2], 3(%[dst]) \n\t"
88
89 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
90 [p1] "=&r" (p1), [p2] "=&r" (p2),
91 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
92 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
93 : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
94 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
95 );
96
97 /* Next row... */
98 src += src_stride;
99 dst += dst_stride;
100 }
101 }
102
convolve_bi_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)103 static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
104 int32_t src_stride,
105 uint8_t *dst,
106 int32_t dst_stride,
107 const int16_t *filter_x0,
108 int32_t h) {
109 int32_t y;
110 uint8_t *cm = vpx_ff_cropTbl;
111 uint32_t vector4a = 64;
112 int32_t Temp1, Temp2, Temp3;
113 uint32_t tp1, tp2, tp3;
114 uint32_t p1, p2, p3, p4;
115 uint32_t st0, st1;
116 const int16_t *filter = &filter_x0[3];
117 uint32_t filter45;;
118
119 filter45 = ((const int32_t *)filter)[0];
120
121 for (y = h; y--;) {
122 /* prefetch data to cache memory */
123 prefetch_load(src + src_stride);
124 prefetch_load(src + src_stride + 32);
125 prefetch_store(dst + dst_stride);
126
127 __asm__ __volatile__ (
128 "ulw %[tp1], 0(%[src]) \n\t"
129 "ulw %[tp2], 4(%[src]) \n\t"
130
131 /* even 1. pixel */
132 "mtlo %[vector4a], $ac3 \n\t"
133 "mthi $zero, $ac3 \n\t"
134 "mtlo %[vector4a], $ac2 \n\t"
135 "mthi $zero, $ac2 \n\t"
136 "preceu.ph.qbr %[p1], %[tp1] \n\t"
137 "preceu.ph.qbl %[p2], %[tp1] \n\t"
138 "preceu.ph.qbr %[p3], %[tp2] \n\t"
139 "preceu.ph.qbl %[p4], %[tp2] \n\t"
140 "ulw %[tp3], 8(%[src]) \n\t"
141 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
142 "extp %[Temp1], $ac3, 31 \n\t"
143
144 /* even 2. pixel */
145 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
146 "extp %[Temp3], $ac2, 31 \n\t"
147
148 /* even 3. pixel */
149 "lbux %[st0], %[Temp1](%[cm]) \n\t"
150 "mtlo %[vector4a], $ac1 \n\t"
151 "mthi $zero, $ac1 \n\t"
152 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
153 "extp %[Temp1], $ac1, 31 \n\t"
154
155 /* even 4. pixel */
156 "mtlo %[vector4a], $ac2 \n\t"
157 "mthi $zero, $ac2 \n\t"
158 "mtlo %[vector4a], $ac3 \n\t"
159 "mthi $zero, $ac3 \n\t"
160 "sb %[st0], 0(%[dst]) \n\t"
161 "lbux %[st1], %[Temp3](%[cm]) \n\t"
162
163 "balign %[tp3], %[tp2], 3 \n\t"
164 "balign %[tp2], %[tp1], 3 \n\t"
165
166 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
167 "extp %[Temp3], $ac2, 31 \n\t"
168
169 "lbux %[st0], %[Temp1](%[cm]) \n\t"
170
171 /* odd 1. pixel */
172 "mtlo %[vector4a], $ac1 \n\t"
173 "mthi $zero, $ac1 \n\t"
174 "sb %[st1], 2(%[dst]) \n\t"
175 "preceu.ph.qbr %[p1], %[tp2] \n\t"
176 "preceu.ph.qbl %[p2], %[tp2] \n\t"
177 "preceu.ph.qbr %[p3], %[tp3] \n\t"
178 "preceu.ph.qbl %[p4], %[tp3] \n\t"
179 "sb %[st0], 4(%[dst]) \n\t"
180 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
181 "extp %[Temp2], $ac3, 31 \n\t"
182
183 /* odd 2. pixel */
184 "mtlo %[vector4a], $ac3 \n\t"
185 "mthi $zero, $ac3 \n\t"
186 "mtlo %[vector4a], $ac2 \n\t"
187 "mthi $zero, $ac2 \n\t"
188 "lbux %[st0], %[Temp3](%[cm]) \n\t"
189 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
190 "extp %[Temp3], $ac1, 31 \n\t"
191
192 /* odd 3. pixel */
193 "lbux %[st1], %[Temp2](%[cm]) \n\t"
194 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
195 "extp %[Temp2], $ac3, 31 \n\t"
196
197 /* odd 4. pixel */
198 "sb %[st1], 1(%[dst]) \n\t"
199 "sb %[st0], 6(%[dst]) \n\t"
200 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
201 "extp %[Temp1], $ac2, 31 \n\t"
202
203 /* clamp */
204 "lbux %[p4], %[Temp3](%[cm]) \n\t"
205 "lbux %[p2], %[Temp2](%[cm]) \n\t"
206 "lbux %[p1], %[Temp1](%[cm]) \n\t"
207
208 /* store bytes */
209 "sb %[p4], 3(%[dst]) \n\t"
210 "sb %[p2], 5(%[dst]) \n\t"
211 "sb %[p1], 7(%[dst]) \n\t"
212
213 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
214 [st0] "=&r" (st0), [st1] "=&r" (st1),
215 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
216 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
217 : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
218 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
219 );
220
221 /* Next row... */
222 src += src_stride;
223 dst += dst_stride;
224 }
225 }
226
convolve_bi_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)227 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
228 int32_t src_stride,
229 uint8_t *dst_ptr,
230 int32_t dst_stride,
231 const int16_t *filter_x0,
232 int32_t h,
233 int32_t count) {
234 int32_t y, c;
235 const uint8_t *src;
236 uint8_t *dst;
237 uint8_t *cm = vpx_ff_cropTbl;
238 uint32_t vector_64 = 64;
239 int32_t Temp1, Temp2, Temp3;
240 uint32_t qload1, qload2, qload3;
241 uint32_t p1, p2, p3, p4, p5;
242 uint32_t st1, st2, st3;
243 const int16_t *filter = &filter_x0[3];
244 uint32_t filter45;;
245
246 filter45 = ((const int32_t *)filter)[0];
247
248 for (y = h; y--;) {
249 src = src_ptr;
250 dst = dst_ptr;
251
252 /* prefetch data to cache memory */
253 prefetch_load(src_ptr + src_stride);
254 prefetch_load(src_ptr + src_stride + 32);
255 prefetch_store(dst_ptr + dst_stride);
256
257 for (c = 0; c < count; c++) {
258 __asm__ __volatile__ (
259 "ulw %[qload1], 0(%[src]) \n\t"
260 "ulw %[qload2], 4(%[src]) \n\t"
261
262 /* even 1. pixel */
263 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
264 "mthi $zero, $ac1 \n\t"
265 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
266 "mthi $zero, $ac2 \n\t"
267 "preceu.ph.qbr %[p1], %[qload1] \n\t"
268 "preceu.ph.qbl %[p2], %[qload1] \n\t"
269 "preceu.ph.qbr %[p3], %[qload2] \n\t"
270 "preceu.ph.qbl %[p4], %[qload2] \n\t"
271 "ulw %[qload3], 8(%[src]) \n\t"
272 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
273 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
274
275 /* even 2. pixel */
276 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
277 "mthi $zero, $ac3 \n\t"
278 "preceu.ph.qbr %[p1], %[qload3] \n\t"
279 "preceu.ph.qbl %[p5], %[qload3] \n\t"
280 "ulw %[qload1], 12(%[src]) \n\t"
281 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
282 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
283 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
284
285 /* even 3. pixel */
286 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
287 "mthi $zero, $ac1 \n\t"
288 "preceu.ph.qbr %[p2], %[qload1] \n\t"
289 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
290 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
291 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
292 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
293
294 /* even 4. pixel */
295 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
296 "mthi $zero, $ac2 \n\t"
297 "preceu.ph.qbl %[p3], %[qload1] \n\t"
298 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
299 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
300 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
301 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
302
303 /* even 5. pixel */
304 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
305 "mthi $zero, $ac3 \n\t"
306 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
307 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
308 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
309 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
310
311 /* even 6. pixel */
312 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
313 "mthi $zero, $ac1 \n\t"
314 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
315 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
316 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
317 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
318
319 /* even 7. pixel */
320 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
321 "mthi $zero, $ac2 \n\t"
322 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
323 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
324 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
325 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
326
327 /* even 8. pixel */
328 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
329 "mthi $zero, $ac3 \n\t"
330 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
331 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
332 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
333 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
334
335 /* ODD pixels */
336 "ulw %[qload1], 1(%[src]) \n\t"
337 "ulw %[qload2], 5(%[src]) \n\t"
338
339 /* odd 1. pixel */
340 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
341 "mthi $zero, $ac1 \n\t"
342 "preceu.ph.qbr %[p1], %[qload1] \n\t"
343 "preceu.ph.qbl %[p2], %[qload1] \n\t"
344 "preceu.ph.qbr %[p3], %[qload2] \n\t"
345 "preceu.ph.qbl %[p4], %[qload2] \n\t"
346 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
347 "ulw %[qload3], 9(%[src]) \n\t"
348 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
349 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
350 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
351
352 /* odd 2. pixel */
353 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
354 "mthi $zero, $ac2 \n\t"
355 "preceu.ph.qbr %[p1], %[qload3] \n\t"
356 "preceu.ph.qbl %[p5], %[qload3] \n\t"
357 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
358 "ulw %[qload1], 13(%[src]) \n\t"
359 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
360 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
361 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
362
363 /* odd 3. pixel */
364 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
365 "mthi $zero, $ac3 \n\t"
366 "preceu.ph.qbr %[p2], %[qload1] \n\t"
367 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
368 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
369 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
370 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
371
372 /* odd 4. pixel */
373 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
374 "mthi $zero, $ac1 \n\t"
375 "preceu.ph.qbl %[p3], %[qload1] \n\t"
376 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
377 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
378 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
379 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
380
381 /* odd 5. pixel */
382 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
383 "mthi $zero, $ac2 \n\t"
384 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
385 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
386 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
387 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
388
389 /* odd 6. pixel */
390 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
391 "mthi $zero, $ac3 \n\t"
392 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
393 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
394 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
395 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
396
397 /* odd 7. pixel */
398 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
399 "mthi $zero, $ac1 \n\t"
400 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
401 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
402 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
403
404 /* odd 8. pixel */
405 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
406 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
407
408 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
409 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
410 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
411
412 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
413 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
414 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
415
416 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
417 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
418 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
419 [p5] "=&r" (p5),
420 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
421 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
422 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
423 );
424
425 src += 16;
426 dst += 16;
427 }
428
429 /* Next row... */
430 src_ptr += src_stride;
431 dst_ptr += dst_stride;
432 }
433 }
434
convolve_bi_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)435 static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
436 int32_t src_stride,
437 uint8_t *dst_ptr,
438 int32_t dst_stride,
439 const int16_t *filter_x0,
440 int32_t h) {
441 int32_t y, c;
442 const uint8_t *src;
443 uint8_t *dst;
444 uint8_t *cm = vpx_ff_cropTbl;
445 uint32_t vector_64 = 64;
446 int32_t Temp1, Temp2, Temp3;
447 uint32_t qload1, qload2, qload3;
448 uint32_t p1, p2, p3, p4, p5;
449 uint32_t st1, st2, st3;
450 const int16_t *filter = &filter_x0[3];
451 uint32_t filter45;;
452
453 filter45 = ((const int32_t *)filter)[0];
454
455 for (y = h; y--;) {
456 src = src_ptr;
457 dst = dst_ptr;
458
459 /* prefetch data to cache memory */
460 prefetch_load(src_ptr + src_stride);
461 prefetch_load(src_ptr + src_stride + 32);
462 prefetch_load(src_ptr + src_stride + 64);
463 prefetch_store(dst_ptr + dst_stride);
464 prefetch_store(dst_ptr + dst_stride + 32);
465
466 for (c = 0; c < 4; c++) {
467 __asm__ __volatile__ (
468 "ulw %[qload1], 0(%[src]) \n\t"
469 "ulw %[qload2], 4(%[src]) \n\t"
470
471 /* even 1. pixel */
472 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
473 "mthi $zero, $ac1 \n\t"
474 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
475 "mthi $zero, $ac2 \n\t"
476 "preceu.ph.qbr %[p1], %[qload1] \n\t"
477 "preceu.ph.qbl %[p2], %[qload1] \n\t"
478 "preceu.ph.qbr %[p3], %[qload2] \n\t"
479 "preceu.ph.qbl %[p4], %[qload2] \n\t"
480 "ulw %[qload3], 8(%[src]) \n\t"
481 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
482 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
483
484 /* even 2. pixel */
485 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
486 "mthi $zero, $ac3 \n\t"
487 "preceu.ph.qbr %[p1], %[qload3] \n\t"
488 "preceu.ph.qbl %[p5], %[qload3] \n\t"
489 "ulw %[qload1], 12(%[src]) \n\t"
490 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
491 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
492 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
493
494 /* even 3. pixel */
495 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
496 "mthi $zero, $ac1 \n\t"
497 "preceu.ph.qbr %[p2], %[qload1] \n\t"
498 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
499 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
500 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
501 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
502
503 /* even 4. pixel */
504 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
505 "mthi $zero, $ac2 \n\t"
506 "preceu.ph.qbl %[p3], %[qload1] \n\t"
507 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
508 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
509 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
510 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
511
512 /* even 5. pixel */
513 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
514 "mthi $zero, $ac3 \n\t"
515 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
516 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
517 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
518 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
519
520 /* even 6. pixel */
521 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
522 "mthi $zero, $ac1 \n\t"
523 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
524 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
525 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
526 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
527
528 /* even 7. pixel */
529 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
530 "mthi $zero, $ac2 \n\t"
531 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
532 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
533 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
534 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
535
536 /* even 8. pixel */
537 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
538 "mthi $zero, $ac3 \n\t"
539 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
540 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
541 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
542 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
543
544 /* ODD pixels */
545 "ulw %[qload1], 1(%[src]) \n\t"
546 "ulw %[qload2], 5(%[src]) \n\t"
547
548 /* odd 1. pixel */
549 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
550 "mthi $zero, $ac1 \n\t"
551 "preceu.ph.qbr %[p1], %[qload1] \n\t"
552 "preceu.ph.qbl %[p2], %[qload1] \n\t"
553 "preceu.ph.qbr %[p3], %[qload2] \n\t"
554 "preceu.ph.qbl %[p4], %[qload2] \n\t"
555 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
556 "ulw %[qload3], 9(%[src]) \n\t"
557 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
558 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
559 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
560
561 /* odd 2. pixel */
562 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
563 "mthi $zero, $ac2 \n\t"
564 "preceu.ph.qbr %[p1], %[qload3] \n\t"
565 "preceu.ph.qbl %[p5], %[qload3] \n\t"
566 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
567 "ulw %[qload1], 13(%[src]) \n\t"
568 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
569 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
570 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
571
572 /* odd 3. pixel */
573 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
574 "mthi $zero, $ac3 \n\t"
575 "preceu.ph.qbr %[p2], %[qload1] \n\t"
576 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
577 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
578 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
579 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
580
581 /* odd 4. pixel */
582 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
583 "mthi $zero, $ac1 \n\t"
584 "preceu.ph.qbl %[p3], %[qload1] \n\t"
585 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
586 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
587 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
588 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
589
590 /* odd 5. pixel */
591 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
592 "mthi $zero, $ac2 \n\t"
593 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
594 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
595 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
596 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
597
598 /* odd 6. pixel */
599 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
600 "mthi $zero, $ac3 \n\t"
601 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
602 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
603 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
604 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
605
606 /* odd 7. pixel */
607 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
608 "mthi $zero, $ac1 \n\t"
609 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
610 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
611 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
612
613 /* odd 8. pixel */
614 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
615 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
616
617 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
618 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
619 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
620
621 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
622 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
623 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
624
625 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
626 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
627 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
628 [p5] "=&r" (p5),
629 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
630 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
631 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
632 );
633
634 src += 16;
635 dst += 16;
636 }
637
638 /* Next row... */
639 src_ptr += src_stride;
640 dst_ptr += dst_stride;
641 }
642 }
643
/* Public entry point: 2-tap (bilinear) horizontal convolution, DSPr2.
 *
 * Programs the DSPControl "pos" field (required by the extp instructions
 * in the width-specialized kernels below), prefetches the first rows,
 * then dispatches on width w.  Widths other than 4/8/16/32/64 fall back
 * to the generic C implementation.
 *
 * Only x_step_q4 == 16 (no horizontal scaling) is supported, enforced by
 * the assert.  filter_y/y_step_q4 are unused except for the C fallback.
 */
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  uint32_t pos = 38;

  assert(x_step_q4 == 16);

  prefetch_load((const uint8_t *)filter_x);

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h);
      break;
    case 8:
      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h);
      break;
    case 16:
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filter_x, (int32_t)h, 1);
      break;
    case 32:
      /* 32-wide rows reuse the 16-wide kernel with count = 2 */
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filter_x, (int32_t)h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filter_x, (int32_t)h);
      break;
    default:
      /* unsupported width: generic C path */
      vpx_convolve8_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
      break;
  }
}
705 #endif
706