• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_convolve.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_dsp/vpx_filter.h"
19 #include "vpx_ports/mem.h"
20 
21 #if HAVE_DSPR2
convolve_avg_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)22 static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
23                                        uint8_t *dst, int32_t dst_stride,
24                                        const int16_t *filter_x0, int32_t h) {
25   int32_t y;
26   uint8_t *cm = vpx_ff_cropTbl;
27   int32_t vector1b, vector2b, vector3b, vector4b;
28   int32_t Temp1, Temp2, Temp3, Temp4;
29   uint32_t vector4a = 64;
30   uint32_t tp1, tp2;
31   uint32_t p1, p2, p3, p4;
32   uint32_t n1, n2, n3, n4;
33   uint32_t tn1, tn2;
34 
35   vector1b = ((const int32_t *)filter_x0)[0];
36   vector2b = ((const int32_t *)filter_x0)[1];
37   vector3b = ((const int32_t *)filter_x0)[2];
38   vector4b = ((const int32_t *)filter_x0)[3];
39 
40   for (y = h; y--;) {
41     /* prefetch data to cache memory */
42     prefetch_load(src + src_stride);
43     prefetch_load(src + src_stride + 32);
44     prefetch_store(dst + dst_stride);
45 
46     __asm__ __volatile__(
47         "ulw              %[tp1],         0(%[src])                      \n\t"
48         "ulw              %[tp2],         4(%[src])                      \n\t"
49 
50         /* even 1. pixel */
51         "mtlo             %[vector4a],    $ac3                           \n\t"
52         "mthi             $zero,          $ac3                           \n\t"
53         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
54         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
55         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
56         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
57         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
58         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
59         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
60         "ulw              %[tn2],         8(%[src])                      \n\t"
61         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
62         "extp             %[Temp1],       $ac3,           31             \n\t"
63 
64         /* even 2. pixel */
65         "mtlo             %[vector4a],    $ac2                           \n\t"
66         "mthi             $zero,          $ac2                           \n\t"
67         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
68         "balign           %[tn1],         %[tn2],         3              \n\t"
69         "balign           %[tn2],         %[tp2],         3              \n\t"
70         "balign           %[tp2],         %[tp1],         3              \n\t"
71         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
72         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
73         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
74         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
75         "extp             %[Temp3],       $ac2,           31             \n\t"
76 
77         "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
78 
79         /* odd 1. pixel */
80         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
81         "mtlo             %[vector4a],    $ac3                           \n\t"
82         "mthi             $zero,          $ac3                           \n\t"
83         "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
84         "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
85         "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
86         "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
87         "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
88         "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
89         "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
90         "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
91         "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
92         "extp             %[Temp2],       $ac3,           31             \n\t"
93 
94         "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
95 
96         /* odd 2. pixel */
97         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
98         "mtlo             %[vector4a],    $ac2                           \n\t"
99         "mthi             $zero,          $ac2                           \n\t"
100         "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
101         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
102         "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
103         "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
104         "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
105         "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
106         "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
107         "extp             %[Temp4],       $ac2,           31             \n\t"
108 
109         "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
110         "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
111 
112         /* clamp */
113         "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
114         "lbux             %[n2],          %[Temp4](%[cm])                \n\t" /* odd 2 */
115         "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
116 
117         "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
118         "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
119 
120         "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t" /* average odd 2 */
121         "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
122 
123         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
124           [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
125           [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
126           [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
127           [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
128         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
129           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
130           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
131           [src] "r"(src));
132 
133     /* Next row... */
134     src += src_stride;
135     dst += dst_stride;
136   }
137 }
138 
convolve_avg_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)139 static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
140                                        uint8_t *dst, int32_t dst_stride,
141                                        const int16_t *filter_x0, int32_t h) {
142   int32_t y;
143   uint8_t *cm = vpx_ff_cropTbl;
144   uint32_t vector4a = 64;
145   int32_t vector1b, vector2b, vector3b, vector4b;
146   int32_t Temp1, Temp2, Temp3;
147   uint32_t tp1, tp2;
148   uint32_t p1, p2, p3, p4, n1;
149   uint32_t tn1, tn2, tn3;
150   uint32_t st0, st1;
151 
152   vector1b = ((const int32_t *)filter_x0)[0];
153   vector2b = ((const int32_t *)filter_x0)[1];
154   vector3b = ((const int32_t *)filter_x0)[2];
155   vector4b = ((const int32_t *)filter_x0)[3];
156 
157   for (y = h; y--;) {
158     /* prefetch data to cache memory */
159     prefetch_load(src + src_stride);
160     prefetch_load(src + src_stride + 32);
161     prefetch_store(dst + dst_stride);
162 
163     __asm__ __volatile__(
164         "ulw              %[tp1],         0(%[src])                      \n\t"
165         "ulw              %[tp2],         4(%[src])                      \n\t"
166 
167         /* even 1. pixel */
168         "mtlo             %[vector4a],    $ac3                           \n\t"
169         "mthi             $zero,          $ac3                           \n\t"
170         "mtlo             %[vector4a],    $ac2                           \n\t"
171         "mthi             $zero,          $ac2                           \n\t"
172         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
173         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
174         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
175         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
176         "ulw              %[tn2],         8(%[src])                      \n\t"
177         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
178         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
179         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
180         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
181         "extp             %[Temp1],       $ac3,           31             \n\t"
182         "lbu              %[Temp2],       0(%[dst])                      \n\t"
183         "lbu              %[tn3],         2(%[dst])                      \n\t"
184 
185         /* even 2. pixel */
186         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
187         "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
188         "ulw              %[tn1],         12(%[src])                     \n\t"
189         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
190         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
191         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
192         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
193         "extp             %[Temp3],       $ac2,           31             \n\t"
194 
195         /* even 3. pixel */
196         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
197         "mtlo             %[vector4a],    $ac1                           \n\t"
198         "mthi             $zero,          $ac1                           \n\t"
199         "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
200         "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
201         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
202         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
203         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
204         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
205         "extp             %[Temp1],       $ac1,           31             \n\t"
206 
207         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
208         "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
209         "sb               %[Temp2],       0(%[dst])                      \n\t"
210         "sb               %[tn3],         2(%[dst])                      \n\t"
211 
212         /* even 4. pixel */
213         "mtlo             %[vector4a],    $ac2                           \n\t"
214         "mthi             $zero,          $ac2                           \n\t"
215         "mtlo             %[vector4a],    $ac3                           \n\t"
216         "mthi             $zero,          $ac3                           \n\t"
217 
218         "balign           %[tn3],         %[tn1],         3              \n\t"
219         "balign           %[tn1],         %[tn2],         3              \n\t"
220         "balign           %[tn2],         %[tp2],         3              \n\t"
221         "balign           %[tp2],         %[tp1],         3              \n\t"
222 
223         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
224         "lbu              %[Temp2],       4(%[dst])                      \n\t"
225         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
226 
227         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
228         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
229         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
230         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
231         "extp             %[Temp3],       $ac2,           31             \n\t"
232 
233         /* odd 1. pixel */
234         "mtlo             %[vector4a],    $ac1                           \n\t"
235         "mthi             $zero,          $ac1                           \n\t"
236         "sb               %[Temp2],       4(%[dst])                      \n\t"
237         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
238         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
239         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
240         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
241         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
242         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
243         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
244         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
245         "extp             %[Temp2],       $ac3,           31             \n\t"
246 
247         "lbu              %[tp1],         6(%[dst])                      \n\t"
248 
249         /* odd 2. pixel */
250         "mtlo             %[vector4a],    $ac3                           \n\t"
251         "mthi             $zero,          $ac3                           \n\t"
252         "mtlo             %[vector4a],    $ac2                           \n\t"
253         "mthi             $zero,          $ac2                           \n\t"
254         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
255         "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
256         "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
257         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
258         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
259         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
260         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
261         "extp             %[Temp3],       $ac1,           31             \n\t"
262 
263         "lbu              %[tp2],         1(%[dst])                      \n\t"
264         "lbu              %[tn2],         3(%[dst])                      \n\t"
265         "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
266 
267         /* odd 3. pixel */
268         "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
269         "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
270         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
271         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
272         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
273         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
274         "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
275         "extp             %[Temp2],       $ac3,           31             \n\t"
276 
277         "lbu              %[tn3],         5(%[dst])                      \n\t"
278 
279         /* odd 4. pixel */
280         "sb               %[tp2],         1(%[dst])                      \n\t"
281         "sb               %[tp1],         6(%[dst])                      \n\t"
282         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
283         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
284         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
285         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
286         "extp             %[Temp1],       $ac2,           31             \n\t"
287 
288         "lbu              %[tn1],         7(%[dst])                      \n\t"
289 
290         /* clamp */
291         "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
292         "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
293 
294         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
295         "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
296 
297         "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
298         "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
299 
300         /* store bytes */
301         "sb               %[tn2],         3(%[dst])                      \n\t"
302         "sb               %[tn3],         5(%[dst])                      \n\t"
303         "sb               %[tn1],         7(%[dst])                      \n\t"
304 
305         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
306           [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
307           [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
308           [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
309           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
310         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
311           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
312           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
313           [src] "r"(src));
314 
315     /* Next row... */
316     src += src_stride;
317     dst += dst_stride;
318   }
319 }
320 
convolve_avg_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)321 static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
322                                         int32_t src_stride, uint8_t *dst_ptr,
323                                         int32_t dst_stride,
324                                         const int16_t *filter_x0, int32_t h,
325                                         int32_t count) {
326   int32_t y, c;
327   const uint8_t *src;
328   uint8_t *dst;
329   uint8_t *cm = vpx_ff_cropTbl;
330   uint32_t vector_64 = 64;
331   int32_t filter12, filter34, filter56, filter78;
332   int32_t Temp1, Temp2, Temp3;
333   uint32_t qload1, qload2, qload3;
334   uint32_t p1, p2, p3, p4, p5;
335   uint32_t st1, st2, st3;
336 
337   filter12 = ((const int32_t *)filter_x0)[0];
338   filter34 = ((const int32_t *)filter_x0)[1];
339   filter56 = ((const int32_t *)filter_x0)[2];
340   filter78 = ((const int32_t *)filter_x0)[3];
341 
342   for (y = h; y--;) {
343     src = src_ptr;
344     dst = dst_ptr;
345 
346     /* prefetch data to cache memory */
347     prefetch_load(src_ptr + src_stride);
348     prefetch_load(src_ptr + src_stride + 32);
349     prefetch_store(dst_ptr + dst_stride);
350 
351     for (c = 0; c < count; c++) {
352       __asm__ __volatile__(
353           "ulw              %[qload1],    0(%[src])                    \n\t"
354           "ulw              %[qload2],    4(%[src])                    \n\t"
355 
356           /* even 1. pixel */
357           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
358           "mthi             $zero,        $ac1                         \n\t"
359           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
360           "mthi             $zero,        $ac2                         \n\t"
361           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
362           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
363           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
364           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
365           "ulw              %[qload3],    8(%[src])                    \n\t"
366           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
367           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
368           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
369           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
370           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
371           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
372 
373           /* even 2. pixel */
374           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
375           "mthi             $zero,        $ac3                         \n\t"
376           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
377           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
378           "ulw              %[qload1],    12(%[src])                   \n\t"
379           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
380           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
381           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
382           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
383           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
384           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
385 
386           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
387 
388           /* even 3. pixel */
389           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
390           "mthi             $zero,        $ac1                         \n\t"
391           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
392           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
393           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
394           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
395           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
396           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
397           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
398           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
399           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
400 
401           /* even 4. pixel */
402           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
403           "mthi             $zero,        $ac2                         \n\t"
404           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
405           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
406           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
407           "ulw              %[qload2],    16(%[src])                   \n\t"
408           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
409           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
410           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
411           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
412           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
413           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
414           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
415           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
416 
417           /* even 5. pixel */
418           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
419           "mthi             $zero,        $ac3                         \n\t"
420           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
421           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
422           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
423           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
424           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
425           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
426           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
427           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
428           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
429 
430           /* even 6. pixel */
431           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
432           "mthi             $zero,        $ac1                         \n\t"
433           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
434           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
435           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
436           "ulw              %[qload3],    20(%[src])                   \n\t"
437           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
438           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
439           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
440           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
441           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
442           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
443           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
444 
445           /* even 7. pixel */
446           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
447           "mthi             $zero,        $ac2                         \n\t"
448           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
449           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
450           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
451           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
452           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
453           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
454           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
455           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
456           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
457           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
458 
459           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
460 
461           /* even 8. pixel */
462           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
463           "mthi             $zero,        $ac3                         \n\t"
464           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
465           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
466           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
467           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
468           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
469           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
470           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
471           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
472 
473           /* ODD pixels */
474           "ulw              %[qload1],    1(%[src])                   \n\t"
475           "ulw              %[qload2],    5(%[src])                    \n\t"
476 
477           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
478 
479           /* odd 1. pixel */
480           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
481           "mthi             $zero,        $ac1                         \n\t"
482           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
483           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
484           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
485           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
486           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
487           "ulw              %[qload3],    9(%[src])                    \n\t"
488           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
489           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
490           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
491           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
492           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
493           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
494           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
495 
496           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
497 
498           /* odd 2. pixel */
499           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
500           "mthi             $zero,        $ac2                         \n\t"
501           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
502           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
503           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
504           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
505           "ulw              %[qload1],    13(%[src])                   \n\t"
506           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
507           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
508           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
509           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
510           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
511           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
512           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
513 
514           /* odd 3. pixel */
515           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
516           "mthi             $zero,        $ac3                         \n\t"
517           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
518           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
519           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
520           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
521           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
522           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
523           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
524           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
525           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
526 
527           /* odd 4. pixel */
528           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
529           "mthi             $zero,        $ac1                         \n\t"
530           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
531           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
532           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
533           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
534           "ulw              %[qload2],    17(%[src])                   \n\t"
535           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
536           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
537           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
538           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
539           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
540           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
541 
542           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
543 
544           /* odd 5. pixel */
545           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
546           "mthi             $zero,        $ac2                         \n\t"
547           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
548           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
549           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
550           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
551           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
552           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
553           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
554           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
555           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
556 
557           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
558 
559           /* odd 6. pixel */
560           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
561           "mthi             $zero,        $ac3                         \n\t"
562           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
563           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
564           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
565           "ulw              %[qload3],    21(%[src])                   \n\t"
566           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
567           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
568           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
569           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
570           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
571           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
572 
573           /* odd 7. pixel */
574           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
575           "mthi             $zero,        $ac1                         \n\t"
576           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
577           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
578           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
579           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
580           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
581           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
582           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
583           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
584           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
585 
586           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
587 
588           /* odd 8. pixel */
589           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
590           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
591           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
592           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
593           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
594 
595           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
596 
597           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
598           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
599 
600           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
601           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
602 
603           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
604           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
605 
606           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
607           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
608           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
609 
610           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
611             [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
612             [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
613             [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
614             [Temp3] "=&r"(Temp3)
615           : [filter12] "r"(filter12), [filter34] "r"(filter34),
616             [filter56] "r"(filter56), [filter78] "r"(filter78),
617             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
618             [src] "r"(src));
619 
620       src += 16;
621       dst += 16;
622     }
623 
624     /* Next row... */
625     src_ptr += src_stride;
626     dst_ptr += dst_stride;
627   }
628 }
629 
convolve_avg_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)630 static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
631                                         int32_t src_stride, uint8_t *dst_ptr,
632                                         int32_t dst_stride,
633                                         const int16_t *filter_x0, int32_t h) {
634   int32_t y, c;
635   const uint8_t *src;
636   uint8_t *dst;
637   uint8_t *cm = vpx_ff_cropTbl;
638   uint32_t vector_64 = 64;
639   int32_t filter12, filter34, filter56, filter78;
640   int32_t Temp1, Temp2, Temp3;
641   uint32_t qload1, qload2, qload3;
642   uint32_t p1, p2, p3, p4, p5;
643   uint32_t st1, st2, st3;
644 
645   filter12 = ((const int32_t *)filter_x0)[0];
646   filter34 = ((const int32_t *)filter_x0)[1];
647   filter56 = ((const int32_t *)filter_x0)[2];
648   filter78 = ((const int32_t *)filter_x0)[3];
649 
650   for (y = h; y--;) {
651     src = src_ptr;
652     dst = dst_ptr;
653 
654     /* prefetch data to cache memory */
655     prefetch_load(src_ptr + src_stride);
656     prefetch_load(src_ptr + src_stride + 32);
657     prefetch_load(src_ptr + src_stride + 64);
658     prefetch_store(dst_ptr + dst_stride);
659     prefetch_store(dst_ptr + dst_stride + 32);
660 
661     for (c = 0; c < 4; c++) {
662       __asm__ __volatile__(
663           "ulw              %[qload1],    0(%[src])                    \n\t"
664           "ulw              %[qload2],    4(%[src])                    \n\t"
665 
666           /* even 1. pixel */
667           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
668           "mthi             $zero,        $ac1                         \n\t"
669           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
670           "mthi             $zero,        $ac2                         \n\t"
671           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
672           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
673           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
674           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
675           "ulw              %[qload3],    8(%[src])                    \n\t"
676           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
677           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
678           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
679           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
680           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
681           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
682 
683           /* even 2. pixel */
684           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
685           "mthi             $zero,        $ac3                         \n\t"
686           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
687           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
688           "ulw              %[qload1],    12(%[src])                   \n\t"
689           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
690           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
691           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
692           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
693           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
694           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
695 
696           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
697 
698           /* even 3. pixel */
699           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
700           "mthi             $zero,        $ac1                         \n\t"
701           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
702           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
703           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
704           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
705           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
706           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
707           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
708           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
709           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
710 
711           /* even 4. pixel */
712           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
713           "mthi             $zero,        $ac2                         \n\t"
714           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
715           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
716           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
717           "ulw              %[qload2],    16(%[src])                   \n\t"
718           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
719           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
720           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
721           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
722           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
723           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
724           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
725           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
726 
727           /* even 5. pixel */
728           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
729           "mthi             $zero,        $ac3                         \n\t"
730           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
731           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
732           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
733           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
734           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
735           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
736           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
737           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
738           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
739 
740           /* even 6. pixel */
741           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
742           "mthi             $zero,        $ac1                         \n\t"
743           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
744           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
745           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
746           "ulw              %[qload3],    20(%[src])                   \n\t"
747           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
748           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
749           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
750           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
751           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
752           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
753           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
754 
755           /* even 7. pixel */
756           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
757           "mthi             $zero,        $ac2                         \n\t"
758           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
759           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
760           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
761           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
762           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
763           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
764           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
765           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
766           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
767           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
768 
769           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
770 
771           /* even 8. pixel */
772           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
773           "mthi             $zero,        $ac3                         \n\t"
774           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
775           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
776           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
777           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
778           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
779           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
780           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
781           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
782 
783           /* ODD pixels */
784           "ulw              %[qload1],    1(%[src])                   \n\t"
785           "ulw              %[qload2],    5(%[src])                    \n\t"
786 
787           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
788 
789           /* odd 1. pixel */
790           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
791           "mthi             $zero,        $ac1                         \n\t"
792           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
793           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
794           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
795           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
796           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
797           "ulw              %[qload3],    9(%[src])                    \n\t"
798           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
799           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
800           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
801           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
802           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
803           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
804           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
805 
806           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
807 
808           /* odd 2. pixel */
809           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
810           "mthi             $zero,        $ac2                         \n\t"
811           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
812           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
813           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
814           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
815           "ulw              %[qload1],    13(%[src])                   \n\t"
816           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
817           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
818           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
819           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
820           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
821           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
822           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
823 
824           /* odd 3. pixel */
825           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
826           "mthi             $zero,        $ac3                         \n\t"
827           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
828           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
829           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
830           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
831           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
832           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
833           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
834           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
835           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
836 
837           /* odd 4. pixel */
838           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
839           "mthi             $zero,        $ac1                         \n\t"
840           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
841           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
842           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
843           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
844           "ulw              %[qload2],    17(%[src])                   \n\t"
845           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
846           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
847           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
848           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
849           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
850           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
851 
852           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
853 
854           /* odd 5. pixel */
855           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
856           "mthi             $zero,        $ac2                         \n\t"
857           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
858           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
859           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
860           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
861           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
862           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
863           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
864           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
865           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
866 
867           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
868 
869           /* odd 6. pixel */
870           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
871           "mthi             $zero,        $ac3                         \n\t"
872           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
873           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
874           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
875           "ulw              %[qload3],    21(%[src])                   \n\t"
876           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
877           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
878           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
879           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
880           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
881           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
882 
883           /* odd 7. pixel */
884           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
885           "mthi             $zero,        $ac1                         \n\t"
886           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
887           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
888           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
889           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
890           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
891           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
892           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
893           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
894           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
895 
896           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
897 
898           /* odd 8. pixel */
899           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
900           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
901           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
902           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
903           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
904 
905           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
906 
907           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
908           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
909 
910           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
911           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
912 
913           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
914           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
915 
916           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
917           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
918           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
919 
920           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
921             [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
922             [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
923             [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
924             [Temp3] "=&r"(Temp3)
925           : [filter12] "r"(filter12), [filter34] "r"(filter34),
926             [filter56] "r"(filter56), [filter78] "r"(filter78),
927             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
928             [src] "r"(src));
929 
930       src += 16;
931       dst += 16;
932     }
933 
934     /* Next row... */
935     src_ptr += src_stride;
936     dst_ptr += dst_stride;
937   }
938 }
939 
/*
 * DSPr2 entry point for the horizontal "convolve8 avg" operation.
 *
 * The kernel used is filter[x0_q4]. Only a horizontal step of exactly 16 is
 * supported (checked by assert), and the kernel's second 32-bit word must not
 * equal 0x800000 (checked by assert; NOTE(review): presumably this rejects
 * the pass-through kernel, which callers route elsewhere -- confirm).
 *
 * Dispatch:
 *   - kernels reported as 2-tap by vpx_get_filter_taps() are forwarded
 *     unchanged to vpx_convolve2_avg_horiz_dspr2();
 *   - otherwise the DSP control "pos" field is programmed, the first source
 *     and destination lines are prefetched, and a width-specific helper is
 *     called with the source pointer moved back by 3 bytes;
 *   - widths other than 4, 8, 16, 32 and 64 fall back to
 *     vpx_convolve8_avg_horiz_c() with the original source pointer.
 */
void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  const uint8_t *src_taps;
  uint32_t acc_extract_pos;

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  /* Two-tap kernels have their own implementation; nothing else to do here. */
  if (vpx_get_filter_taps(filter_x) == 2) {
    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
    return;
  }

  /* The DSPr2 helpers read starting 3 bytes before the output position. */
  src_taps = src - 3;

  /* bit position for extract from acc */
  acc_extract_pos = 38;
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(acc_extract_pos));

  /* prefetch data to cache memory */
  prefetch_load(src_taps);
  prefetch_load(src_taps + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_avg_horiz_4_dspr2(src_taps, src_stride, dst, dst_stride,
                                 filter_x, h);
      break;
    case 8:
      convolve_avg_horiz_8_dspr2(src_taps, src_stride, dst, dst_stride,
                                 filter_x, h);
      break;
    case 16:
    case 32:
      /* Last argument is the number of 16-pixel groups per row: 1 or 2. */
      convolve_avg_horiz_16_dspr2(src_taps, src_stride, dst, dst_stride,
                                  filter_x, h, w / 16);
      break;
    case 64:
      /* Wider rows: pull in the next stretch of source and destination. */
      prefetch_load(src_taps + 64);
      prefetch_store(dst + 32);

      convolve_avg_horiz_64_dspr2(src_taps, src_stride, dst, dst_stride,
                                  filter_x, h);
      break;
    default:
      /* Unsupported width: use the C reference with the unshifted source. */
      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
}
998 #endif
999