/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
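/* Vertical 8-tap convolve-and-average for widths that are multiples of 4
 * (4/8/16/32).  Each inner-loop iteration produces four output pixels,
 * one per DSP accumulator ($ac0-$ac3).  As a rough scalar sketch of what
 * one output pixel computes (documentation only, assuming the usual
 * FILTER_BITS == 7 convention used elsewhere in vpx_dsp):
 *
 *   int k, val, sum = 64;                    // 1 << (FILTER_BITS - 1)
 *   for (k = 0; k < 8; ++k)
 *     sum += src[(k - 3) * src_stride] * filter_y[k];
 *   val = clip_pixel(sum >> 7);              // clamp via vpx_ff_cropTbl
 *   dst[0] = (dst[0] + val + 1) >> 1;        // rounded average with dst
 */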
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vpx_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

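  /* The eight 16-bit filter taps are read as four packed halfword pairs so
   * that each dpa.w.ph below applies two taps per instruction.  This
   * assumes filter_y is suitably aligned for 32-bit loads. */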
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

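      /* The asm below loads eight source rows with unaligned word loads
       * (ulw), widens bytes to halfwords (preceu.ph.qbr/qbl), regroups
       * them into per-column pairs (precrq.ph.w / append), and feeds two
       * taps at a time into the four accumulators with dpa.w.ph.  extp
       * then applies the descaling shift (the extract position was set by
       * wrdsp in the caller), lbux clamps through the crop table, and
       * addqh_r.w forms the rounded average (a + b + 1) >> 1 with the
       * pixel already in dst. */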
      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

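/* Identical pipeline to convolve_avg_vert_4_dspr2, specialized for
 * w == 64 so the column count is a compile-time constant and an extra
 * cache line of dst can be prefetched per row. */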
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vpx_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);
    prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

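      /* Same eight-tap convolve-and-average sequence as in
       * convolve_avg_vert_4_dspr2 above; see the comments there. */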
      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

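/* Dispatcher for the vertical convolve-and-average.  A zero first tap
 * pair marks the 2-tap (bilinear) case, which is routed to
 * vpx_convolve2_avg_vert_dspr2; supported widths use the DSPr2 kernels
 * above, and anything else falls back to the C implementation. */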
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    prefetch_store(dst);

    switch (w) {
      case 4:
      case 8:
      case 16:
      case 32:
        convolve_avg_vert_4_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, w, h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_avg_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
        break;
    }
  }
}

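/* Two-pass 8-tap convolve-and-average: filter horizontally into a 64-wide
 * intermediate buffer (h + 7 rows, giving the vertical pass three rows of
 * context above and four below), then filter vertically out of that
 * buffer while averaging into dst. */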
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  if (intermediate_height < h)
    intermediate_height = h;

  vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vpx_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

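/* Plain averaging with no filtering: dst[x] = (dst[x] + src[x] + 1) >> 1.
 * The common widths process a whole row as 32-bit words, with adduh_r.qb
 * doing the rounded per-byte average; other widths take the scalar loop
 * in the default case. */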
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif