/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
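/* Vertical 8-tap convolution averaged into the destination with rounding
 * (addqh_r.w). Produces four output pixels per inner-loop iteration, one per
 * DSPr2 accumulator ($ac0-$ac3); the eight filter taps are read from
 * filter_y as four packed 16-bit pairs. Used by the dispatcher below for
 * widths 4 through 32. */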
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "mtlo %[vector4a], $ac0 \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "mthi $zero, $ac0 \n\t"
          "mthi $zero, $ac1 \n\t"
          "mthi $zero, $ac2 \n\t"
          "mthi $zero, $ac3 \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"

          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
          "extp %[Temp1], $ac0, 31 \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
          "extp %[Temp2], $ac1, 31 \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
          "lbu %[scratch2], 1(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
          "extp %[Temp1], $ac2, 31 \n\t"

          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
          "extp %[Temp2], $ac3, 31 \n\t"
          "lbu %[scratch1], 2(%[dst_ptr]) \n\t"

          "sb %[store1], 0(%[dst_ptr]) \n\t"
          "sb %[store2], 1(%[dst_ptr]) \n\t"
          "lbu %[scratch2], 3(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */

          "sb %[store1], 2(%[dst_ptr]) \n\t"
          "sb %[store2], 3(%[dst_ptr]) \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

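/* Same vertical 8-tap convolve-and-average as above, specialized for a fixed
 * width of 64 pixels, with an additional prefetch of the second half of the
 * destination row. */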
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);
    prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "mtlo %[vector4a], $ac0 \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "mthi $zero, $ac0 \n\t"
          "mthi $zero, $ac1 \n\t"
          "mthi $zero, $ac2 \n\t"
          "mthi $zero, $ac3 \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"

          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
          "extp %[Temp1], $ac0, 31 \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
          "extp %[Temp2], $ac1, 31 \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
          "lbu %[scratch2], 1(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
          "extp %[Temp1], $ac2, 31 \n\t"

          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
          "extp %[Temp2], $ac3, 31 \n\t"
          "lbu %[scratch1], 2(%[dst_ptr]) \n\t"

          "sb %[store1], 0(%[dst_ptr]) \n\t"
          "sb %[store2], 1(%[dst_ptr]) \n\t"
          "lbu %[scratch2], 3(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */

          "sb %[store1], 2(%[dst_ptr]) \n\t"
          "sb %[store2], 3(%[dst_ptr]) \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

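/* Dispatcher for the vertical convolve-and-average. Requires y_step_q4 == 16,
 * routes filters whose first tap pair is zero to vpx_convolve2_avg_vert_dspr2,
 * programs the accumulator extract position via wrdsp, and then selects the
 * 4-wide or 64-wide kernel (or the C fallback) based on w. */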
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp %[pos], 1 \n\t"
        :
        : [pos] "r" (pos)
    );

    prefetch_store(dst);

    switch (w) {
      case 4:
      case 8:
      case 16:
      case 32:
        convolve_avg_vert_4_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, w, h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_avg_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
        break;
    }
  }
}

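/* Two-pass 8-tap convolve-and-average: horizontal filtering into a 64-wide
 * intermediate buffer, followed by vertical filtering with averaging into
 * dst. */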
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  if (intermediate_height < h)
    intermediate_height = h;

  vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vpx_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

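/* Unfiltered averaging of src into dst: dst[x] = (dst[x] + src[x] + 1) >> 1.
 * Widths 4 through 64 use unrolled word-wise DSPr2 rounding averages
 * (adduh_r.qb); other widths fall back to a scalar loop. */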
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 0(%[dst]) \n\t" /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 8(%[src]) \n\t"
            "ulw %[tp2], 8(%[dst]) \n\t"
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */
            "ulw %[tp3], 12(%[src]) \n\t"
            "ulw %[tp4], 12(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 8(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 12(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 8(%[src]) \n\t"
            "ulw %[tp2], 8(%[dst]) \n\t"
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */
            "ulw %[tp3], 12(%[src]) \n\t"
            "ulw %[tp4], 12(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 16(%[src]) \n\t"
            "ulw %[tp2], 16(%[dst]) \n\t"
            "sw %[tn1], 8(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 12(%[dst]) \n\t" /* store */
            "ulw %[tp3], 20(%[src]) \n\t"
            "ulw %[tp4], 20(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 24(%[src]) \n\t"
            "ulw %[tp2], 24(%[dst]) \n\t"
            "sw %[tn1], 16(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 20(%[dst]) \n\t" /* store */
            "ulw %[tp3], 28(%[src]) \n\t"
            "ulw %[tp4], 28(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 24(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 28(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 8(%[src]) \n\t"
            "ulw %[tp2], 8(%[dst]) \n\t"
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */
            "ulw %[tp3], 12(%[src]) \n\t"
            "ulw %[tp4], 12(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 16(%[src]) \n\t"
            "ulw %[tp2], 16(%[dst]) \n\t"
            "sw %[tn1], 8(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 12(%[dst]) \n\t" /* store */
            "ulw %[tp3], 20(%[src]) \n\t"
            "ulw %[tp4], 20(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 24(%[src]) \n\t"
            "ulw %[tp2], 24(%[dst]) \n\t"
            "sw %[tn1], 16(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 20(%[dst]) \n\t" /* store */
            "ulw %[tp3], 28(%[src]) \n\t"
            "ulw %[tp4], 28(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 32(%[src]) \n\t"
            "ulw %[tp2], 32(%[dst]) \n\t"
            "sw %[tn1], 24(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 28(%[dst]) \n\t" /* store */
            "ulw %[tp3], 36(%[src]) \n\t"
            "ulw %[tp4], 36(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 40(%[src]) \n\t"
            "ulw %[tp2], 40(%[dst]) \n\t"
            "sw %[tn1], 32(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 36(%[dst]) \n\t" /* store */
            "ulw %[tp3], 44(%[src]) \n\t"
            "ulw %[tp4], 44(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 48(%[src]) \n\t"
            "ulw %[tp2], 48(%[dst]) \n\t"
            "sw %[tn1], 40(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 44(%[dst]) \n\t" /* store */
            "ulw %[tp3], 52(%[src]) \n\t"
            "ulw %[tp4], 52(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 56(%[src]) \n\t"
            "ulw %[tp2], 56(%[dst]) \n\t"
            "sw %[tn1], 48(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 52(%[dst]) \n\t" /* store */
            "ulw %[tp3], 60(%[src]) \n\t"
            "ulw %[tp4], 60(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 56(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 60(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif