/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

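/*
 * DSPr2 implementations of the "8" loop filters for horizontal and vertical
 * block edges.  Pixels are packed four to a 32-bit register and filtered
 * with SIMD-within-a-register instructions; each function below makes two
 * 4-pixel passes to cover 8 positions along the edge.
 */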
#if HAVE_DSPR2
void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
                                const uint8_t *thresh,
                                int count) {
  uint32_t  mask;
  uint32_t  hev, flat;
  uint8_t   i;
  uint8_t   *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  uint32_t  uflimit, ulimit, uthresh;
  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
  uint32_t  p3, p2, p1, p0, q0, q1, q2, q3;
  uint32_t  p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  uint32_t  p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;

  uflimit = *blimit;
  ulimit  = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
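
  /* replv.qb broadcasts the least-significant byte of the source register
     into all four byte lanes, turning each scalar threshold into a
     quad-byte vector that can be compared against four pixels at once.
     A plain-C equivalent (illustrative only, not part of the build):

         thresh_vec = uthresh * 0x01010101;
         flimit_vec = uflimit * 0x01010101;
         limit_vec  = ulimit  * 0x01010101;
  */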

  /* prefetch data for store */
  prefetch_store(s);

  for (i = 0; i < 2; i++) {
    sp3 = s - (pitch << 2);
    sp2 = sp3 + pitch;
    sp1 = sp2 + pitch;
    sp0 = sp1 + pitch;
    sq0 = s;
    sq1 = s + pitch;
    sq2 = sq1 + pitch;
    sq3 = sq2 + pitch;

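    /* sp3..sp0 point at the four rows above the horizontal edge and
       sq0..sq3 at the four rows below it; each 32-bit load below fetches
       the same four pixel columns from one row. */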
    __asm__ __volatile__ (
        "lw     %[p3],      (%[sp3])    \n\t"
        "lw     %[p2],      (%[sp2])    \n\t"
        "lw     %[p1],      (%[sp1])    \n\t"
        "lw     %[p0],      (%[sp0])    \n\t"
        "lw     %[q0],      (%[sq0])    \n\t"
        "lw     %[q1],      (%[sq1])    \n\t"
        "lw     %[q2],      (%[sq2])    \n\t"
        "lw     %[q3],      (%[sq3])    \n\t"

        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
          [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0)
        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
          [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
    );

    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                    p1, p0, p3, p2, q0, q1, q2, q3,
                                    &hev, &mask, &flat);

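    /* hev, mask and flat come back as per-pixel byte masks: byte n of each
       word refers to pixel column n of the four just loaded.  Three cases
       follow: no column is flat (use only the 4-tap filter1 result), every
       column passes both tests (use the wide mbfilter result everywhere),
       or the two results must be mixed column by column via the masks. */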
    if ((flat == 0) && (mask != 0)) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      __asm__ __volatile__ (
          "sw       %[p1_f0],   (%[sp1])    \n\t"
          "sw       %[p0_f0],   (%[sp0])    \n\t"
          "sw       %[q0_f0],   (%[sq0])    \n\t"
          "sw       %[q1_f0],   (%[sq1])    \n\t"

          :
          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
            [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1)
      );
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      COMBINE_LEFT_RIGHT_0TO2()

      __asm__ __volatile__ (
          "sw       %[p2],      (%[sp2])    \n\t"
          "sw       %[p1],      (%[sp1])    \n\t"
          "sw       %[p0],      (%[sp0])    \n\t"
          "sw       %[q0],      (%[sq0])    \n\t"
          "sw       %[q1],      (%[sq1])    \n\t"
          "sw       %[q2],      (%[sq2])    \n\t"

          :
          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
      );
    } else if ((flat != 0) && (mask != 0)) {
      /* filtering */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__ (
            "sb     %[p2_r],    (%[sp2])    \n\t"
            "sb     %[p1_r],    (%[sp1])    \n\t"
            "sb     %[p0_r],    (%[sp0])    \n\t"
            "sb     %[q0_r],    (%[sq0])    \n\t"
            "sb     %[q1_r],    (%[sq1])    \n\t"
            "sb     %[q2_r],    (%[sq2])    \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  (%[sp1])    \n\t"
            "sb         %[p0_f0],  (%[sp0])    \n\t"
            "sb         %[q0_f0],  (%[sq0])    \n\t"
            "sb         %[q1_f0],  (%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

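      /* Byte-lane write-back: the stores above handled pixel column 0
         (low lane).  Each srl block below shifts the packed results so the
         next column lands in the low lane before its conditional store.
         The _r registers hold columns 0-1 as halfwords (hence srl by 16)
         and the _l registers columns 2-3; the *_f0 results from
         filter1_dspr2 stay byte-packed, so they shift by 8. */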
      __asm__ __volatile__ (
          "srl      %[p2_r],    %[p2_r],    16      \n\t"
          "srl      %[p1_r],    %[p1_r],    16      \n\t"
          "srl      %[p0_r],    %[p0_r],    16      \n\t"
          "srl      %[q0_r],    %[q0_r],    16      \n\t"
          "srl      %[q1_r],    %[q1_r],    16      \n\t"
          "srl      %[q2_r],    %[q2_r],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb     %[p2_r],    +1(%[sp2])    \n\t"
            "sb     %[p1_r],    +1(%[sp1])    \n\t"
            "sb     %[p0_r],    +1(%[sp0])    \n\t"
            "sb     %[q0_r],    +1(%[sq0])    \n\t"
            "sb     %[q1_r],    +1(%[sq1])    \n\t"
            "sb     %[q2_r],    +1(%[sq2])    \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb     %[p1_f0],   +1(%[sp1])    \n\t"
            "sb     %[p0_f0],   +1(%[sp0])    \n\t"
            "sb     %[q0_f0],   +1(%[sq0])    \n\t"
            "sb     %[q1_f0],   +1(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      __asm__ __volatile__ (
          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"

          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb     %[p2_l],    +2(%[sp2])    \n\t"
            "sb     %[p1_l],    +2(%[sp1])    \n\t"
            "sb     %[p0_l],    +2(%[sp0])    \n\t"
            "sb     %[q0_l],    +2(%[sq0])    \n\t"
            "sb     %[q1_l],    +2(%[sq1])    \n\t"
            "sb     %[q2_l],    +2(%[sq2])    \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb     %[p1_f0],   +2(%[sp1])    \n\t"
            "sb     %[p0_f0],   +2(%[sp0])    \n\t"
            "sb     %[q0_f0],   +2(%[sq0])    \n\t"
            "sb     %[q1_f0],   +2(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      __asm__ __volatile__ (
          "srl      %[p2_l],    %[p2_l],    16      \n\t"
          "srl      %[p1_l],    %[p1_l],    16      \n\t"
          "srl      %[p0_l],    %[p0_l],    16      \n\t"
          "srl      %[q0_l],    %[q0_l],    16      \n\t"
          "srl      %[q1_l],    %[q1_l],    16      \n\t"
          "srl      %[q2_l],    %[q2_l],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__ (
            "sb     %[p2_l],    +3(%[sp2])    \n\t"
            "sb     %[p1_l],    +3(%[sp1])    \n\t"
            "sb     %[p0_l],    +3(%[sp0])    \n\t"
            "sb     %[q0_l],    +3(%[sq0])    \n\t"
            "sb     %[q1_l],    +3(%[sq1])    \n\t"
            "sb     %[q2_l],    +3(%[sq2])    \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__ (
            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
            "sb     %[q1_f0],   +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }
    }

    s = s + 4;
  }
}

void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
                              const uint8_t *thresh,
                              int count) {
  uint8_t   i;
  uint32_t  mask, hev, flat;
  uint8_t   *s1, *s2, *s3, *s4;
  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  uint32_t  uflimit, ulimit, uthresh;
  uint32_t  p3, p2, p1, p0, q3, q2, q1, q0;
  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
  uint32_t  p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  uint32_t  p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;

  uflimit = *blimit;
  ulimit  = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb     %[thresh_vec],  %[uthresh]    \n\t"
      "replv.qb     %[flimit_vec],  %[uflimit]    \n\t"
      "replv.qb     %[limit_vec],   %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s  = s4 + pitch;

    __asm__ __volatile__ (
        "lw     %[p0],  -4(%[s1])    \n\t"
        "lw     %[p1],  -4(%[s2])    \n\t"
        "lw     %[p2],  -4(%[s3])    \n\t"
        "lw     %[p3],  -4(%[s4])    \n\t"
        "lw     %[q3],    (%[s1])    \n\t"
        "lw     %[q2],    (%[s2])    \n\t"
        "lw     %[q1],    (%[s3])    \n\t"
        "lw     %[q0],    (%[s4])    \n\t"

        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
          [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );

    /* transpose p3, p2, p1, p0
       original (when loaded from memory)
       register       -4    -3    -2    -1
         p0         p0_0  p0_1  p0_2  p0_3
         p1         p1_0  p1_1  p1_2  p1_3
         p2         p2_0  p2_1  p2_2  p2_3
         p3         p3_0  p3_1  p3_2  p3_3

       after transpose
       register
         p0         p3_3  p2_3  p1_3  p0_3
         p1         p3_2  p2_2  p1_2  p0_2
         p2         p3_1  p2_1  p1_1  p0_1
         p3         p3_0  p2_0  p1_0  p0_0
    */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p0],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p0],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p2],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p2],      %[p3]       \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p0],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p2],      %[p3],      %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
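
    /* The precrq/precr/append sequence above is, in effect, a 4x4 byte
       transpose: each output register gathers the same byte position from
       all four input rows, as the diagram above shows.  A scalar sketch of
       the same idea (illustrative only, not part of the build):

           uint8_t in[4][4], out[4][4];
           int r, c;
           for (r = 0; r < 4; r++)
             for (c = 0; c < 4; c++)
               out[c][r] = in[r][c];
    */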

    /* transpose q0, q1, q2, q3
       original (when loaded from memory)
       register       +1    +2    +3    +4
         q3         q3_0  q3_1  q3_2  q3_3
         q2         q2_0  q2_1  q2_2  q2_3
         q1         q1_0  q1_1  q1_2  q1_3
         q0         q0_0  q0_1  q0_2  q0_3

       after transpose
       register
         q3         q0_3  q1_3  q2_3  q3_3
         q2         q0_2  q1_2  q2_2  q3_2
         q1         q0_1  q1_1  q2_1  q3_1
         q0         q0_0  q1_0  q2_0  q3_0
    */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[q3],      %[q2]       \n\t"
        "precr.qb.ph    %[prim2],   %[q3],      %[q2]       \n\t"
        "precrq.qb.ph   %[prim3],   %[q1],      %[q0]       \n\t"
        "precr.qb.ph    %[prim4],   %[q1],      %[q0]       \n\t"

        "precrq.qb.ph   %[q2],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[q0],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[q3],      %[q2],      %[sec3]     \n\t"
        "precrq.ph.w    %[q1],      %[q0],      %[sec4]     \n\t"
        "append         %[q2],      %[sec3],    16          \n\t"
        "append         %[q0],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                    p1, p0, p3, p2, q0, q1, q2, q3,
                                    &hev, &mask, &flat);

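    /* Same three-way dispatch as in the horizontal filter.  For a vertical
       edge the results go back row by row: STORE_F0()/STORE_F1() (macros
       from the loopfilter_*_dspr2.h headers included above) write the
       all-simple and all-wide cases, while the mixed case below stores one
       row at a time, with byte lane 0 belonging to row s4 and byte lane 3
       to row s1 (see the sb/srl sequence that follows). */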
    if ((flat == 0) && (mask != 0)) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      STORE_F0()
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      STORE_F1()
    } else if ((flat != 0) && (mask != 0)) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p2_r],  -3(%[s4])    \n\t"
            "sb         %[p1_r],  -2(%[s4])    \n\t"
            "sb         %[p0_r],  -1(%[s4])    \n\t"
            "sb         %[q0_r],    (%[s4])    \n\t"
            "sb         %[q1_r],  +1(%[s4])    \n\t"
            "sb         %[q2_r],  +2(%[s4])    \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [s4] "r" (s4)
        );
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  -2(%[s4])    \n\t"
            "sb         %[p0_f0],  -1(%[s4])    \n\t"
            "sb         %[q0_f0],    (%[s4])    \n\t"
            "sb         %[q1_f0],  +1(%[s4])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s4] "r" (s4)
        );
      }
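
      /* As in the horizontal filter, the srl blocks below rotate the next
         pixel's result into the low lane between row stores: the
         halfword-packed _r/_l values shift by 16 and the byte-packed *_f0
         values by 8 before each subsequent sb. */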

      __asm__ __volatile__ (
          "srl      %[p2_r],    %[p2_r],    16      \n\t"
          "srl      %[p1_r],    %[p1_r],    16      \n\t"
          "srl      %[p0_r],    %[p0_r],    16      \n\t"
          "srl      %[q0_r],    %[q0_r],    16      \n\t"
          "srl      %[q1_r],    %[q1_r],    16      \n\t"
          "srl      %[q2_r],    %[q2_r],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb         %[p2_r],  -3(%[s3])    \n\t"
            "sb         %[p1_r],  -2(%[s3])    \n\t"
            "sb         %[p0_r],  -1(%[s3])    \n\t"
            "sb         %[q0_r],    (%[s3])    \n\t"
            "sb         %[q1_r],  +1(%[s3])    \n\t"
            "sb         %[q2_r],  +2(%[s3])    \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [s3] "r" (s3)
        );
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  -2(%[s3])    \n\t"
            "sb         %[p0_f0],  -1(%[s3])    \n\t"
            "sb         %[q0_f0],    (%[s3])    \n\t"
            "sb         %[q1_f0],  +1(%[s3])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s3] "r" (s3)
        );
      }

      __asm__ __volatile__ (
          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"

          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb         %[p2_l],  -3(%[s2])    \n\t"
            "sb         %[p1_l],  -2(%[s2])    \n\t"
            "sb         %[p0_l],  -1(%[s2])    \n\t"
            "sb         %[q0_l],    (%[s2])    \n\t"
            "sb         %[q1_l],  +1(%[s2])    \n\t"
            "sb         %[q2_l],  +2(%[s2])    \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [s2] "r" (s2)
        );
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  -2(%[s2])    \n\t"
            "sb         %[p0_f0],  -1(%[s2])    \n\t"
            "sb         %[q0_f0],    (%[s2])    \n\t"
            "sb         %[q1_f0],  +1(%[s2])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s2] "r" (s2)
        );
      }

      __asm__ __volatile__ (
          "srl      %[p2_l],    %[p2_l],    16      \n\t"
          "srl      %[p1_l],    %[p1_l],    16      \n\t"
          "srl      %[p0_l],    %[p0_l],    16      \n\t"
          "srl      %[q0_l],    %[q0_l],    16      \n\t"
          "srl      %[q1_l],    %[q1_l],    16      \n\t"
          "srl      %[q2_l],    %[q2_l],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__ (
            "sb         %[p2_l],  -3(%[s1])    \n\t"
            "sb         %[p1_l],  -2(%[s1])    \n\t"
            "sb         %[p0_l],  -1(%[s1])    \n\t"
            "sb         %[q0_l],    (%[s1])    \n\t"
            "sb         %[q1_l],  +1(%[s1])    \n\t"
            "sb         %[q2_l],  +2(%[s1])    \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [s1] "r" (s1)
        );
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__ (
            "sb         %[p1_f0],  -2(%[s1])    \n\t"
            "sb         %[p0_f0],  -1(%[s1])    \n\t"
            "sb         %[q0_f0],    (%[s1])    \n\t"
            "sb         %[q1_f0],  +1(%[s1])    \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
              [q1_f0] "r" (q1_f0), [s1] "r" (s1)
        );
      }
    }
  }
}
#endif  // #if HAVE_DSPR2