• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
/*
 * 8-tap horizontal convolution of a block 4 pixels wide and h rows tall,
 * written as MIPS DSPr2 inline assembly.
 *
 * src, src_stride: source row pointer and the byte step to the next row.
 *     Each row is read with unaligned word loads at offsets 0, 4 and 8.
 * dst, dst_stride: destination row pointer and the byte step to the next
 *     row. Bytes 0..3 of each row are written.
 * filter_x0: read as four 32-bit words, i.e. eight int16_t taps taken two
 *     at a time. NOTE(review): the int16_t -> int32_t pointer cast assumes
 *     the tap table is suitably aligned for 32-bit loads - confirm at the
 *     caller.
 * h: number of rows to process.
 *
 * NOTE(review): extp takes its extract position from the DSPControl
 * register, which is not written anywhere in this function - presumably the
 * caller programs it beforehand; confirm.
 */
convolve_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_horiz_4_dspr2(const uint8_t *src,
22                                    int32_t src_stride,
23                                    uint8_t *dst,
24                                    int32_t dst_stride,
25                                    const int16_t *filter_x0,
26                                    int32_t h) {
27   int32_t y;
  /* Lookup table indexed by each extracted sum to produce the stored byte
   * (the "clamp" step below). */
28   uint8_t *cm = vpx_ff_cropTbl;
29   int32_t vector1b, vector2b, vector3b, vector4b;
30   int32_t Temp1, Temp2, Temp3, Temp4;
  /* Value loaded into the low word of an accumulator (mtlo) before every
   * sum; presumably the rounding term - confirm against the filter scale. */
31   uint32_t vector4a = 64;
32   uint32_t tp1, tp2;
33   uint32_t p1, p2, p3, p4;
34   uint32_t n1, n2, n3, n4;
35   uint32_t tn1, tn2;
36 
  /* Each 32-bit word packs two adjacent 16-bit filter taps. */
37   vector1b = ((const int32_t *)filter_x0)[0];
38   vector2b = ((const int32_t *)filter_x0)[1];
39   vector3b = ((const int32_t *)filter_x0)[2];
40   vector4b = ((const int32_t *)filter_x0)[3];
41 
  /* Runs exactly h times; y itself is not used inside the body. */
42   for (y = h; y--;) {
43     /* prefetch data to cache memory */
44     prefetch_load(src + src_stride);
45     prefetch_load(src + src_stride + 32);
46     prefetch_store(dst + dst_stride);
47 
48     __asm__ __volatile__ (
49         "ulw              %[tp1],      0(%[src])                      \n\t"
50         "ulw              %[tp2],      4(%[src])                      \n\t"
51 
        /* Every output follows the same pattern: seed the accumulator with
         * vector4a (hi word zeroed), add four dpa.w.ph dot products of pixel
         * pairs against tap pairs, then extp the result into a Temp. */
52         /* even 1. pixel */
53         "mtlo             %[vector4a], $ac3                           \n\t"
54         "mthi             $zero,       $ac3                           \n\t"
55         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
56         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
57         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
58         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
59         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
60         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
61         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
62         "ulw              %[tn2],      8(%[src])                      \n\t"
63         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
64         "extp             %[Temp1],    $ac3,           31             \n\t"
65 
66         /* even 2. pixel */
67         "mtlo             %[vector4a], $ac2                           \n\t"
68         "mthi             $zero,       $ac2                           \n\t"
69         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
        /* Build the words consumed by the odd outputs: each balign by 3
         * merges two neighbouring loaded words into one that is offset by a
         * single byte. tn1 has no defined value before its balign; its old
         * contents should land only in the byte that the later
         * preceu.ph.qbr on tn1 does not read - TODO confirm against the
         * balign definition. */
70         "balign           %[tn1],      %[tn2],         3              \n\t"
71         "balign           %[tn2],      %[tp2],         3              \n\t"
72         "balign           %[tp2],      %[tp1],         3              \n\t"
73         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
74         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
75         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
76         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
77         "extp             %[Temp3],    $ac2,           31             \n\t"
78 
79         /* odd 1. pixel */
        /* tp1 is no longer needed as source data, so it is reused to hold
         * the table lookup for even pixel 1 (stored to dst[0] below). */
80         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
81         "mtlo             %[vector4a], $ac3                           \n\t"
82         "mthi             $zero,       $ac3                           \n\t"
83         "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
84         "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
85         "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
86         "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
87         "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
88         "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
89         "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
90         "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
91         "extp             %[Temp2],    $ac3,           31             \n\t"
92 
93         /* odd 2. pixel */
        /* tp2 is likewise reused for the lookup of even pixel 2 (dst[2]). */
94         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
95         "mtlo             %[vector4a], $ac2                           \n\t"
96         "mthi             $zero,       $ac2                           \n\t"
97         "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
98         "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
99         "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
100         "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
101         "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
102         "extp             %[Temp4],    $ac2,           31             \n\t"
103 
104         /* clamp */
105         "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
106         "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
107 
108         /* store bytes */
        /* Even results go to dst[0] and dst[2], odd results to dst[1] and
         * dst[3]. */
109         "sb               %[tp1],      0(%[dst])                      \n\t"
110         "sb               %[tn1],      1(%[dst])                      \n\t"
111         "sb               %[tp2],      2(%[dst])                      \n\t"
112         "sb               %[n2],       3(%[dst])                      \n\t"
113 
        /* Every output operand is an early-clobber (=&r) scratch register.
         * NOTE(review): the statement stores through dst and modifies
         * $ac2/$ac3, yet declares no clobbers (neither memory nor the
         * accumulators) - confirm this is safe for the compilers in use. */
114         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
115           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
116           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
117           [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
118           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
119           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
120         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
121           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
122           [vector4a] "r" (vector4a),
123           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
124     );
125 
126     /* Next row... */
127     src += src_stride;
128     dst += dst_stride;
129   }
130 }
131 
/*
 * 8-tap horizontal convolution of a block 8 pixels wide and h rows tall,
 * written as MIPS DSPr2 inline assembly.
 *
 * src, src_stride: source row pointer and the byte step to the next row.
 *     Each row is read with unaligned word loads at offsets 0, 4, 8 and 12.
 * dst, dst_stride: destination row pointer and the byte step to the next
 *     row. Bytes 0..7 of each row are written.
 * filter_x0: read as four 32-bit words, i.e. eight int16_t taps taken two
 *     at a time. NOTE(review): the int16_t -> int32_t pointer cast assumes
 *     the tap table is suitably aligned for 32-bit loads - confirm at the
 *     caller.
 * h: number of rows to process.
 *
 * The four even outputs (dst[0], [2], [4], [6]) are computed first from the
 * words as loaded; the four odd outputs (dst[1], [3], [5], [7]) are then
 * computed from the balign-shifted words. Table lookups and byte stores for
 * finished pixels are interleaved with the arithmetic of later pixels.
 *
 * NOTE(review): extp takes its extract position from the DSPControl
 * register, which is not written anywhere in this function - presumably the
 * caller programs it beforehand; confirm.
 */
convolve_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)132 static void convolve_horiz_8_dspr2(const uint8_t *src,
133                                    int32_t src_stride,
134                                    uint8_t *dst,
135                                    int32_t dst_stride,
136                                    const int16_t *filter_x0,
137                                    int32_t h) {
138   int32_t y;
  /* Lookup table indexed by each extracted sum to produce the stored byte. */
139   uint8_t *cm = vpx_ff_cropTbl;
  /* Value loaded into the low word of an accumulator (mtlo) before every
   * sum; presumably the rounding term - confirm against the filter scale. */
140   uint32_t vector4a = 64;
141   int32_t vector1b, vector2b, vector3b, vector4b;
142   int32_t Temp1, Temp2, Temp3;
143   uint32_t tp1, tp2;
144   uint32_t p1, p2, p3, p4, n1;
145   uint32_t tn1, tn2, tn3;
146   uint32_t st0, st1;
147 
  /* Each 32-bit word packs two adjacent 16-bit filter taps. */
148   vector1b = ((const int32_t *)filter_x0)[0];
149   vector2b = ((const int32_t *)filter_x0)[1];
150   vector3b = ((const int32_t *)filter_x0)[2];
151   vector4b = ((const int32_t *)filter_x0)[3];
152 
  /* Runs exactly h times; y itself is not used inside the body. */
153   for (y = h; y--;) {
154     /* prefetch data to cache memory */
155     prefetch_load(src + src_stride);
156     prefetch_load(src + src_stride + 32);
157     prefetch_store(dst + dst_stride);
158 
159     __asm__ __volatile__ (
160         "ulw              %[tp1],      0(%[src])                      \n\t"
161         "ulw              %[tp2],      4(%[src])                      \n\t"
162 
163         /* even 1. pixel */
        /* $ac3 accumulates even pixel 1; $ac2 is seeded here for even
         * pixel 2. */
164         "mtlo             %[vector4a], $ac3                           \n\t"
165         "mthi             $zero,       $ac3                           \n\t"
166         "mtlo             %[vector4a], $ac2                           \n\t"
167         "mthi             $zero,       $ac2                           \n\t"
168         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
169         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
170         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
171         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
172         "ulw              %[tn2],      8(%[src])                      \n\t"
173         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
174         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
175         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
176         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
177         "extp             %[Temp1],    $ac3,           31             \n\t"
178 
179         /* even 2. pixel */
180         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
181         "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
182         "ulw              %[tn1],      12(%[src])                     \n\t"
183         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
184         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
185         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
186         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
187         "extp             %[Temp3],    $ac2,           31             \n\t"
188 
189         /* even 3. pixel */
        /* Look up even pixel 1 now; Temp1 is overwritten by the extp at the
         * end of this section. */
190         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
191         "mtlo             %[vector4a], $ac1                           \n\t"
192         "mthi             $zero,       $ac1                           \n\t"
193         "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
194         "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
195         "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
196         "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
197         "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
198         "extp             %[Temp1],    $ac1,           31             \n\t"
199 
200         /* even 4. pixel */
        /* $ac2 accumulates even pixel 4; $ac3 is seeded here and first used
         * by odd pixel 1 below. */
201         "mtlo             %[vector4a], $ac2                           \n\t"
202         "mthi             $zero,       $ac2                           \n\t"
203         "mtlo             %[vector4a], $ac3                           \n\t"
204         "mthi             $zero,       $ac3                           \n\t"
205         "sb               %[st0],      0(%[dst])                      \n\t"
206         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
207 
        /* Build the words consumed by the odd outputs: each balign by 3
         * merges two neighbouring loaded words into one that is offset by a
         * single byte. tn3 has no defined value before its balign; its old
         * contents should land only in the byte that the later
         * preceu.ph.qbr on tn3 does not read - TODO confirm against the
         * balign definition. */
208         "balign           %[tn3],      %[tn1],         3              \n\t"
209         "balign           %[tn1],      %[tn2],         3              \n\t"
210         "balign           %[tn2],      %[tp2],         3              \n\t"
211         "balign           %[tp2],      %[tp1],         3              \n\t"
212 
213         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
214         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
215         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
216         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
217         "extp             %[Temp3],    $ac2,           31             \n\t"
218 
        /* Lookup for even pixel 3 (stored to dst[4] below). */
219         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
220 
221         /* odd 1. pixel */
        /* This pixel accumulates in $ac3 (seeded above); $ac1 is seeded here
         * and first used by odd pixel 2. */
222         "mtlo             %[vector4a], $ac1                           \n\t"
223         "mthi             $zero,       $ac1                           \n\t"
224         "sb               %[st1],      2(%[dst])                      \n\t"
225         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
226         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
227         "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
228         "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
229         "sb               %[st0],      4(%[dst])                      \n\t"
230         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
231         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
232         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
233         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
234         "extp             %[Temp2],    $ac3,           31             \n\t"
235 
236         /* odd 2. pixel */
        /* This pixel accumulates in $ac1; $ac3 and $ac2 are re-seeded here
         * for odd pixels 3 and 4 respectively. */
237         "mtlo             %[vector4a], $ac3                           \n\t"
238         "mthi             $zero,       $ac3                           \n\t"
239         "mtlo             %[vector4a], $ac2                           \n\t"
240         "mthi             $zero,       $ac2                           \n\t"
241         "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
242         "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
        /* Lookup for even pixel 4 (stored to dst[6] below); Temp3 is
         * overwritten by the extp at the end of this section. */
243         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
244         "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
245         "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
246         "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
247         "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
248         "extp             %[Temp3],    $ac1,           31             \n\t"
249 
250         /* odd 3. pixel */
        /* Lookup for odd pixel 1 (stored to dst[1] below). */
251         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
252         "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
253         "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
254         "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
255         "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
256         "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
257         "extp             %[Temp2],    $ac3,           31             \n\t"
258 
259         /* odd 4. pixel */
260         "sb               %[st1],      1(%[dst])                      \n\t"
261         "sb               %[st0],      6(%[dst])                      \n\t"
262         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
263         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
264         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
265         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
266         "extp             %[Temp1],    $ac2,           31             \n\t"
267 
268         /* clamp */
        /* p4, p2 and n1 are reused as holders for the last three looked-up
         * bytes (odd pixels 2, 3 and 4). */
269         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
270         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
271         "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
272 
273         /* store bytes */
274         "sb               %[p4],       3(%[dst])                      \n\t"
275         "sb               %[p2],       5(%[dst])                      \n\t"
276         "sb               %[n1],       7(%[dst])                      \n\t"
277 
        /* Every output operand is an early-clobber (=&r) scratch register.
         * NOTE(review): the statement stores through dst and modifies
         * $ac1/$ac2/$ac3, yet declares no clobbers (neither memory nor the
         * accumulators) - confirm this is safe for the compilers in use. */
278         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
279           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
280           [st0] "=&r" (st0), [st1] "=&r" (st1),
281           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
282           [n1] "=&r" (n1),
283           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
284         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
285           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
286           [vector4a] "r" (vector4a),
287           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
288     );
289 
290     /* Next row... */
291     src += src_stride;
292     dst += dst_stride;
293   }
294 }
295 
convolve_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)296 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
297                                     int32_t src_stride,
298                                     uint8_t *dst_ptr,
299                                     int32_t dst_stride,
300                                     const int16_t *filter_x0,
301                                     int32_t h,
302                                     int32_t count) {
303   int32_t y, c;
304   const uint8_t *src;
305   uint8_t *dst;
306   uint8_t *cm = vpx_ff_cropTbl;
307   uint32_t vector_64 = 64;
308   int32_t filter12, filter34, filter56, filter78;
309   int32_t Temp1, Temp2, Temp3;
310   uint32_t qload1, qload2, qload3;
311   uint32_t p1, p2, p3, p4, p5;
312   uint32_t st1, st2, st3;
313 
314   filter12 = ((const int32_t *)filter_x0)[0];
315   filter34 = ((const int32_t *)filter_x0)[1];
316   filter56 = ((const int32_t *)filter_x0)[2];
317   filter78 = ((const int32_t *)filter_x0)[3];
318 
319   for (y = h; y--;) {
320     src = src_ptr;
321     dst = dst_ptr;
322 
323     /* prefetch data to cache memory */
324     prefetch_load(src_ptr + src_stride);
325     prefetch_load(src_ptr + src_stride + 32);
326     prefetch_store(dst_ptr + dst_stride);
327 
328     for (c = 0; c < count; c++) {
329       __asm__ __volatile__ (
330           "ulw              %[qload1],    0(%[src])                    \n\t"
331           "ulw              %[qload2],    4(%[src])                    \n\t"
332 
333           /* even 1. pixel */
334           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
335           "mthi             $zero,        $ac1                         \n\t"
336           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
337           "mthi             $zero,        $ac2                         \n\t"
338           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
339           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
340           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
341           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
342           "ulw              %[qload3],    8(%[src])                    \n\t"
343           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
344           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
345           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
346           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
347           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
348 
349           /* even 2. pixel */
350           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
351           "mthi             $zero,        $ac3                         \n\t"
352           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
353           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
354           "ulw              %[qload1],    12(%[src])                   \n\t"
355           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
356           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
357           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
358           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
359           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
360           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
361 
362           /* even 3. pixel */
363           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
364           "mthi             $zero,        $ac1                         \n\t"
365           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
366           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
367           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
368           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
369           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
370           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
371           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
372           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
373 
374           /* even 4. pixel */
375           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
376           "mthi             $zero,        $ac2                         \n\t"
377           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
378           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
379           "ulw              %[qload2],    16(%[src])                   \n\t"
380           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
381           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
382           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
383           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
384           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
385           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
386 
387           /* even 5. pixel */
388           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
389           "mthi             $zero,        $ac3                         \n\t"
390           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
391           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
392           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
393           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
394           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
395           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
396           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
397           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
398 
399           /* even 6. pixel */
400           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
401           "mthi             $zero,        $ac1                         \n\t"
402           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
403           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
404           "ulw              %[qload3],    20(%[src])                   \n\t"
405           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
406           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
407           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
408           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
409           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
410           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
411 
412           /* even 7. pixel */
413           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
414           "mthi             $zero,        $ac2                         \n\t"
415           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
416           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
417           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
418           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
419           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
420           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
421           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
422           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
423 
424           /* even 8. pixel */
425           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
426           "mthi             $zero,        $ac3                         \n\t"
427           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
428           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
429           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
430           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
431           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
432           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
433           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
434 
435           /* ODD pixels */
436           "ulw              %[qload1],    1(%[src])                    \n\t"
437           "ulw              %[qload2],    5(%[src])                    \n\t"
438 
439           /* odd 1. pixel */
440           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
441           "mthi             $zero,        $ac1                         \n\t"
442           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
443           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
444           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
445           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
446           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
447           "ulw              %[qload3],    9(%[src])                    \n\t"
448           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
449           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
450           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
451           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
452           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
453           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
454 
455           /* odd 2. pixel */
456           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
457           "mthi             $zero,        $ac2                         \n\t"
458           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
459           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
460           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
461           "ulw              %[qload1],    13(%[src])                   \n\t"
462           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
463           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
464           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
465           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
466           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
467           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
468 
469           /* odd 3. pixel */
470           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
471           "mthi             $zero,        $ac3                         \n\t"
472           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
473           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
474           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
475           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
476           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
477           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
478           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
479           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
480 
481           /* odd 4. pixel */
482           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
483           "mthi             $zero,        $ac1                         \n\t"
484           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
485           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
486           "ulw              %[qload2],    17(%[src])                   \n\t"
487           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
488           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
489           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
490           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
491           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
492           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
493 
494           /* odd 5. pixel */
495           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
496           "mthi             $zero,        $ac2                         \n\t"
497           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
498           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
499           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
500           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
501           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
502           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
503           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
504           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
505 
506           /* odd 6. pixel */
507           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
508           "mthi             $zero,        $ac3                         \n\t"
509           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
510           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
511           "ulw              %[qload3],    21(%[src])                   \n\t"
512           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
513           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
514           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
515           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
516           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
517           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
518 
519           /* odd 7. pixel */
520           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
521           "mthi             $zero,        $ac1                         \n\t"
522           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
523           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
524           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
525           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
526           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
527           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
528           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
529 
530           /* odd 8. pixel */
531           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
532           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
533           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
534           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
535           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
536 
537           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
538           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
539           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
540 
541           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
542           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
543           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
544 
545           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
546             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
547             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
548             [p5] "=&r" (p5),
549             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
550           : [filter12] "r" (filter12), [filter34] "r" (filter34),
551             [filter56] "r" (filter56), [filter78] "r" (filter78),
552             [vector_64] "r" (vector_64),
553             [cm] "r" (cm), [dst] "r" (dst),
554             [src] "r" (src)
555       );
556 
557       src += 16;
558       dst += 16;
559     }
560 
561     /* Next row... */
562     src_ptr += src_stride;
563     dst_ptr += dst_stride;
564   }
565 }
566 
/*
 * Horizontal 8-tap filter for 64-pixel-wide rows, written as MIPS DSPr2
 * inline assembly.
 *
 * src_ptr / src_stride: first source byte read for a row and the byte step to
 *   the next row. Each 16-pixel chunk loads source bytes 0..24 relative to
 *   its start (the last load is "ulw 21(src)").
 * dst_ptr / dst_stride: first output byte of a row and the byte step to the
 *   next row; 4 chunks of 16 bytes are stored per row.
 * filter_x0: eight int16_t taps, read here as four 32-bit words so that each
 *   word supplies one pair of taps to dpa.w.ph.
 * h: number of rows to process.
 *
 * Per output pixel: an accumulator is preloaded with 64 (lo) and 0 (hi),
 * four dpa.w.ph tap-pair products are accumulated, the result is pulled out
 * with extp and used as a byte index into vpx_ff_cropTbl (presumably a
 * clamp-to-uint8 lookup -- confirm against its definition). The eight
 * even-numbered outputs of a chunk are computed first, then the eight
 * odd-numbered ones from loads shifted by one byte. Work for consecutive
 * pixels is interleaved across $ac1..$ac3, so the trailing comment on each
 * instruction names the pixel that instruction belongs to.
 *
 * extp takes its extract position from the DSP control register, which this
 * function does not write; vpx_convolve8_horiz_dspr2 sets it with wrdsp
 * before calling.
 *
 * NOTE(review): the asm statement loads through src and stores through dst
 * but declares no clobbers (neither "memory" nor the accumulators) --
 * confirm this is intentional.
 */
convolve_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)567 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
568                                     int32_t src_stride,
569                                     uint8_t *dst_ptr,
570                                     int32_t dst_stride,
571                                     const int16_t *filter_x0,
572                                     int32_t h) {
573   int32_t y, c;
574   const uint8_t *src;
575   uint8_t *dst;
576   uint8_t *cm = vpx_ff_cropTbl;
577   uint32_t vector_64 = 64; /* initial accumulator value for every pixel */
578   int32_t filter12, filter34, filter56, filter78;
579   int32_t Temp1, Temp2, Temp3;
580   uint32_t qload1, qload2, qload3;
581   uint32_t p1, p2, p3, p4, p5;
582   uint32_t st1, st2, st3;
583 
584   filter12 = ((const int32_t *)filter_x0)[0]; /* taps 0 and 1 as one word */
585   filter34 = ((const int32_t *)filter_x0)[1]; /* taps 2 and 3 */
586   filter56 = ((const int32_t *)filter_x0)[2]; /* taps 4 and 5 */
587   filter78 = ((const int32_t *)filter_x0)[3]; /* taps 6 and 7 */
588 
589   for (y = h; y--;) {
590     src = src_ptr;
591     dst = dst_ptr;
592 
593     /* prefetch data to cache memory */
594     prefetch_load(src_ptr + src_stride);
595     prefetch_load(src_ptr + src_stride + 32);
596     prefetch_load(src_ptr + src_stride + 64);
597     prefetch_store(dst_ptr + dst_stride);
598     prefetch_store(dst_ptr + dst_stride + 32);
599 
600     for (c = 0; c < 4; c++) { /* 4 chunks of 16 output pixels per row */
601       __asm__ __volatile__ (
602           "ulw              %[qload1],    0(%[src])                    \n\t"
603           "ulw              %[qload2],    4(%[src])                    \n\t"
604 
605           /* even 1. pixel */
606           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
607           "mthi             $zero,        $ac1                         \n\t"
608           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
609           "mthi             $zero,        $ac2                         \n\t"
610           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
611           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
612           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
613           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
614           "ulw              %[qload3],    8(%[src])                    \n\t"
615           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
616           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
617           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
618           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
619           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
620 
621           /* even 2. pixel */
622           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
623           "mthi             $zero,        $ac3                         \n\t"
624           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
625           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
626           "ulw              %[qload1],    12(%[src])                   \n\t"
627           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
628           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
629           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
630           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
631           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
632           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
633 
634           /* even 3. pixel */
635           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
636           "mthi             $zero,        $ac1                         \n\t"
637           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
638           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
639           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
640           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
641           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
642           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
643           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
644           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
645 
646           /* even 4. pixel */
647           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
648           "mthi             $zero,        $ac2                         \n\t"
649           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
650           "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
651           "ulw              %[qload2],    16(%[src])                   \n\t"
652           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
653           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
654           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
655           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
656           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
657           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
658 
659           /* even 5. pixel */
660           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
661           "mthi             $zero,        $ac3                         \n\t"
662           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
663           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
664           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
665           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
666           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
667           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
668           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
669           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
670 
671           /* even 6. pixel */
672           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
673           "mthi             $zero,        $ac1                         \n\t"
674           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
675           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
676           "ulw              %[qload3],    20(%[src])                   \n\t"
677           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
678           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
679           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
680           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
681           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
682           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
683 
684           /* even 7. pixel */
685           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
686           "mthi             $zero,        $ac2                         \n\t"
687           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
688           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
689           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
690           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
691           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
692           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
693           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
694           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
695 
696           /* even 8. pixel */
697           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
698           "mthi             $zero,        $ac3                         \n\t"
699           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
700           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
701           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
702           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
703           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
704           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
705           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
706 
707           /* ODD pixels */
708           "ulw              %[qload1],    1(%[src])                    \n\t"
709           "ulw              %[qload2],    5(%[src])                    \n\t"
710 
711           /* odd 1. pixel */
712           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
713           "mthi             $zero,        $ac1                         \n\t"
714           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
715           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
716           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
717           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
718           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
719           "ulw              %[qload3],    9(%[src])                    \n\t"
720           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
721           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
722           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
723           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
724           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
725           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
726 
727           /* odd 2. pixel */
728           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
729           "mthi             $zero,        $ac2                         \n\t"
730           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
731           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
732           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
733           "ulw              %[qload1],    13(%[src])                   \n\t"
734           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
735           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
736           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
737           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
738           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
739           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
740 
741           /* odd 3. pixel */
742           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
743           "mthi             $zero,        $ac3                         \n\t"
744           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
745           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
746           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
747           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
748           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
749           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
750           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
751           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
752 
753           /* odd 4. pixel */
754           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
755           "mthi             $zero,        $ac1                         \n\t"
756           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
757           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
758           "ulw              %[qload2],    17(%[src])                   \n\t"
759           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
760           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
761           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
762           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
763           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
764           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
765 
766           /* odd 5. pixel */
767           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
768           "mthi             $zero,        $ac2                         \n\t"
769           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
770           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
771           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
772           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
773           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
774           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
775           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
776           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
777 
778           /* odd 6. pixel */
779           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
780           "mthi             $zero,        $ac3                         \n\t"
781           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
782           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
783           "ulw              %[qload3],    21(%[src])                   \n\t"
784           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
785           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
786           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
787           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
788           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
789           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
790 
791           /* odd 7. pixel */
792           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
793           "mthi             $zero,        $ac1                         \n\t"
794           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
795           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
796           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
797           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
798           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
799           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
800           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
801 
802           /* odd 8. pixel */
803           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
804           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
805           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
806           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
807           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
808 
809           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
810           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
811           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
812 
813           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
814           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
815           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
816 
817           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
818             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
819             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
820             [p5] "=&r" (p5),
821             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
822           : [filter12] "r" (filter12), [filter34] "r" (filter34),
823             [filter56] "r" (filter56), [filter78] "r" (filter78),
824             [vector_64] "r" (vector_64),
825             [cm] "r" (cm), [dst] "r" (dst),
826             [src] "r" (src)
827       );
828 
829       src += 16; /* advance to the next 16-pixel chunk of this row */
830       dst += 16;
831     }
832 
833     /* Next row... */
834     src_ptr += src_stride;
835     dst_ptr += dst_stride;
836   }
837 }
838 
/*
 * Horizontal convolution entry point for the MIPS DSPr2 build.
 *
 * Only unscaled filtering is handled: x_step_q4 is asserted to be 16, and
 * the second 32-bit word of filter_x is asserted not to equal 0x800000.
 *
 * When the first 32-bit word of filter_x is zero (taps 0 and 1 both zero)
 * the call is forwarded unchanged to vpx_convolve2_horiz_dspr2(). Otherwise
 * the source pointer is moved back 3 bytes, the DSP control register is
 * written with the extract position, and a width-specific kernel is run for
 * w of 4, 8, 16, 32 or 64. Any other width is forwarded to
 * vpx_convolve8_horiz_c() with the caller's original source pointer.
 *
 * filter_y and y_step_q4 are not used here except as pass-through arguments
 * to the forwarded calls.
 */
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  const int32_t *const tap_pairs = (const int32_t *)filter_x;
  const int32_t in_stride = (int32_t)src_stride;
  const int32_t out_stride = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;
  const uint8_t *window;
  uint32_t extract_pos;

  assert(x_step_q4 == 16);
  assert(tap_pairs[1] != 0x800000);

  /* Leading tap pair is zero: hand the whole job to the 2-tap path. */
  if (tap_pairs[0] == 0) {
    vpx_convolve2_horiz_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
    return;
  }

  extract_pos = 38;

  prefetch_load((const uint8_t *)filter_x);

  /* The DSPr2 kernels start reading 3 bytes before the output position. */
  window = src - 3;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(window);
  prefetch_load(window + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_horiz_4_dspr2(window, in_stride, dst, out_stride,
                             filter_x, rows);
      break;
    case 8:
      convolve_horiz_8_dspr2(window, in_stride, dst, out_stride,
                             filter_x, rows);
      break;
    case 16:
    case 32:
      /* Last argument is 1 for w == 16 and 2 for w == 32. */
      convolve_horiz_16_dspr2(window, in_stride, dst, out_stride,
                              filter_x, rows, w / 16);
      break;
    case 64:
      prefetch_load(window + 64);
      prefetch_store(dst + 32);

      convolve_horiz_64_dspr2(window, in_stride, dst, out_stride,
                              filter_x, rows);
      break;
    default:
      /* Unsupported width: C fallback gets the unadjusted source pointer. */
      vpx_convolve8_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
      break;
  }
}
910 #endif
911