/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"

#ifdef __cplusplus
extern "C" {
#endif

#if HAVE_DSPR2
/* inputs & outputs are quad-byte vectors */
filter_dspr2(uint32_t mask,uint32_t hev,uint32_t * ps1,uint32_t * ps0,uint32_t * qs0,uint32_t * qs1)27 static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
28                                 uint32_t *ps1, uint32_t *ps0,
29                                 uint32_t *qs0, uint32_t *qs1) {
30   int32_t   vpx_filter_l, vpx_filter_r;
31   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
32   int32_t   subr_r, subr_l;
33   uint32_t  t1, t2, HWM, t3;
34   uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
35   int32_t   vps1, vps0, vqs0, vqs1;
36   int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
37   uint32_t  N128;
38 
39   N128 = 0x80808080;
40   t1  = 0x03000300;
41   t2  = 0x04000400;
42   t3  = 0x01000100;
43   HWM = 0xFF00FF00;
44 
45   vps0 = (*ps0) ^ N128;
46   vps1 = (*ps1) ^ N128;
47   vqs0 = (*qs0) ^ N128;
48   vqs1 = (*qs1) ^ N128;
49 
50   /* use halfword pairs instead quad-bytes because of accuracy */
51   vps0_l = vps0 & HWM;
52   vps0_r = vps0 << 8;
53   vps0_r = vps0_r & HWM;
54 
55   vps1_l = vps1 & HWM;
56   vps1_r = vps1 << 8;
57   vps1_r = vps1_r & HWM;
58 
59   vqs0_l = vqs0 & HWM;
60   vqs0_r = vqs0 << 8;
61   vqs0_r = vqs0_r & HWM;
62 
63   vqs1_l = vqs1 & HWM;
64   vqs1_r = vqs1 << 8;
65   vqs1_r = vqs1_r & HWM;
66 
67   mask_l = mask & HWM;
68   mask_r = mask << 8;
69   mask_r = mask_r & HWM;
70 
71   hev_l = hev & HWM;
72   hev_r = hev << 8;
73   hev_r = hev_r & HWM;
74 
75   __asm__ __volatile__ (
76       /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
77       "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
78       "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
79 
80       /* qs0 - ps0 */
81       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
82       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
83 
84       /* vpx_filter &= hev; */
85       "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
86       "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
87 
88       /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
89       "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
90       "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
91       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
92       "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
93       "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
94       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
95       "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
96       "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
97 
98       /* vpx_filter &= mask; */
99       "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
100       "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
101 
102       : [vpx_filter_l] "=&r" (vpx_filter_l),
103         [vpx_filter_r] "=&r" (vpx_filter_r),
104         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
105         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
106       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
107         [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
108         [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
109         [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
110         [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
111         [HWM] "r" (HWM)
112   );
113 
114   /* save bottom 3 bits so that we round one side +4 and the other +3 */
115   __asm__ __volatile__ (
116       /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
117       "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
118       "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
119 
120       /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
121       "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
122       "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
123       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
124       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
125 
126       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
127       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
128 
129       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
130       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
131 
132       /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
133       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
134       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
135 
136       /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
137       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
138       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
139 
140       : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
141         [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
142         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
143         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
144       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
145         [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
146   );
147 
148   __asm__ __volatile__ (
149       /* (vpx_filter += 1) >>= 1 */
150       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
151       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
152 
153       /* vpx_filter &= ~hev; */
154       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
155       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
156 
157       /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
158       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
159       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
160 
161       /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
162       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
163       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
164 
165       : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
166         [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
167         [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
168       : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
169   );
170 
171   /* Create quad-bytes from halfword pairs */
172   vqs0_l = vqs0_l & HWM;
173   vqs1_l = vqs1_l & HWM;
174   vps0_l = vps0_l & HWM;
175   vps1_l = vps1_l & HWM;
176 
177   __asm__ __volatile__ (
178       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
179       "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
180       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
181       "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
182 
183       : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
184         [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
185       :
186   );
187 
188   vqs0 = vqs0_l | vqs0_r;
189   vqs1 = vqs1_l | vqs1_r;
190   vps0 = vps0_l | vps0_r;
191   vps1 = vps1_l | vps1_r;
192 
193   *ps0 = vps0 ^ N128;
194   *ps1 = vps1 ^ N128;
195   *qs0 = vqs0 ^ N128;
196   *qs1 = vqs1 ^ N128;
197 }
198 
filter1_dspr2(uint32_t mask,uint32_t hev,uint32_t ps1,uint32_t ps0,uint32_t qs0,uint32_t qs1,uint32_t * p1_f0,uint32_t * p0_f0,uint32_t * q0_f0,uint32_t * q1_f0)199 static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
200                                  uint32_t ps1, uint32_t ps0,
201                                  uint32_t qs0, uint32_t qs1,
202                                  uint32_t *p1_f0, uint32_t *p0_f0,
203                                  uint32_t *q0_f0, uint32_t *q1_f0) {
204   int32_t   vpx_filter_l, vpx_filter_r;
205   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
206   int32_t   subr_r, subr_l;
207   uint32_t  t1, t2, HWM, t3;
208   uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
209   int32_t   vps1, vps0, vqs0, vqs1;
210   int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
211   uint32_t  N128;
212 
213   N128 = 0x80808080;
214   t1  = 0x03000300;
215   t2  = 0x04000400;
216   t3  = 0x01000100;
217   HWM = 0xFF00FF00;
218 
219   vps0 = (ps0) ^ N128;
220   vps1 = (ps1) ^ N128;
221   vqs0 = (qs0) ^ N128;
222   vqs1 = (qs1) ^ N128;
223 
224   /* use halfword pairs instead quad-bytes because of accuracy */
225   vps0_l = vps0 & HWM;
226   vps0_r = vps0 << 8;
227   vps0_r = vps0_r & HWM;
228 
229   vps1_l = vps1 & HWM;
230   vps1_r = vps1 << 8;
231   vps1_r = vps1_r & HWM;
232 
233   vqs0_l = vqs0 & HWM;
234   vqs0_r = vqs0 << 8;
235   vqs0_r = vqs0_r & HWM;
236 
237   vqs1_l = vqs1 & HWM;
238   vqs1_r = vqs1 << 8;
239   vqs1_r = vqs1_r & HWM;
240 
241   mask_l = mask & HWM;
242   mask_r = mask << 8;
243   mask_r = mask_r & HWM;
244 
245   hev_l = hev & HWM;
246   hev_r = hev << 8;
247   hev_r = hev_r & HWM;
248 
249   __asm__ __volatile__ (
250       /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
251       "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
252       "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
253 
254       /* qs0 - ps0 */
255       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
256       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
257 
258       /* vpx_filter &= hev; */
259       "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
260       "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
261 
262       /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
263       "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
264       "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
265       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
266       "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
267       "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
268       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
269       "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
270       "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
271 
272       /* vpx_filter &= mask; */
273       "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
274       "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
275 
276       : [vpx_filter_l] "=&r" (vpx_filter_l),
277         [vpx_filter_r] "=&r" (vpx_filter_r),
278         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
279         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
280       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
281         [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
282         [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
283         [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
284         [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
285   );
286 
287   /* save bottom 3 bits so that we round one side +4 and the other +3 */
288   __asm__ __volatile__ (
289       /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
290       "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
291       "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
292 
293       /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
294       "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
295       "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
296       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
297       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
298 
299       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
300       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
301 
302       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
303       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
304 
305       /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
306       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
307       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
308 
309       /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
310       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
311       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
312 
313       : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
314         [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
315         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
316         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
317       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
318         [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
319   );
320 
321   __asm__ __volatile__ (
322       /* (vpx_filter += 1) >>= 1 */
323       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
324       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
325 
326       /* vpx_filter &= ~hev; */
327       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
328       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
329 
330       /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
331       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
332       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
333 
334       /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
335       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
336       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
337 
338       : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
339         [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
340         [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
341       : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
342   );
343 
344   /* Create quad-bytes from halfword pairs */
345   vqs0_l = vqs0_l & HWM;
346   vqs1_l = vqs1_l & HWM;
347   vps0_l = vps0_l & HWM;
348   vps1_l = vps1_l & HWM;
349 
350   __asm__ __volatile__ (
351       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
352       "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
353       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
354       "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
355 
356       : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
357         [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
358       :
359   );
360 
361   vqs0 = vqs0_l | vqs0_r;
362   vqs1 = vqs1_l | vqs1_r;
363   vps0 = vps0_l | vps0_r;
364   vps1 = vps1_l | vps1_r;
365 
366   *p0_f0 = vps0 ^ N128;
367   *p1_f0 = vps1 ^ N128;
368   *q0_f0 = vqs0 ^ N128;
369   *q1_f0 = vqs1 ^ N128;
370 }
371 
mbfilter_dspr2(uint32_t * op3,uint32_t * op2,uint32_t * op1,uint32_t * op0,uint32_t * oq0,uint32_t * oq1,uint32_t * oq2,uint32_t * oq3)372 static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
373                                   uint32_t *op1, uint32_t *op0,
374                                   uint32_t *oq0, uint32_t *oq1,
375                                   uint32_t *oq2, uint32_t *oq3) {
376   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
377   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
378   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
379   uint32_t       res_op2, res_op1, res_op0;
380   uint32_t       res_oq0, res_oq1, res_oq2;
381   uint32_t       tmp;
382   uint32_t       add_p210_q012;
383   uint32_t       u32Four = 0x00040004;
384 
385   /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
386   /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
387   /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
388   /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
389   /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
390   /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */
391 
392   __asm__ __volatile__ (
393       "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
394       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
395       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
396       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
397       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
398       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"
399 
400       "shll.ph    %[tmp],            %[p3],             1                \n\t"
401       "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
402       "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
403       "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
404       "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
405       "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
406       "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
407       "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
408       "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
409       "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
410       "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
411       "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
412       "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
413       "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
414       "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
415       "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
416       "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
417       "shll.ph    %[tmp],            %[q3],             1                \n\t"
418       "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
419       "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
420       "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
421       "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
422       "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
423       "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
424       "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
425       "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
426       "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
427       "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
428       "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
429       "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"
430 
431       : [add_p210_q012] "=&r" (add_p210_q012),
432         [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
433         [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
434         [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
435         [res_oq2] "=&r" (res_oq2)
436       : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
437         [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
438         [u32Four] "r" (u32Four)
439   );
440 
441   *op2 = res_op2;
442   *op1 = res_op1;
443   *op0 = res_op0;
444   *oq0 = res_oq0;
445   *oq1 = res_oq1;
446   *oq2 = res_oq2;
447 }
448 
mbfilter1_dspr2(uint32_t p3,uint32_t p2,uint32_t p1,uint32_t p0,uint32_t q0,uint32_t q1,uint32_t q2,uint32_t q3,uint32_t * op2_f1,uint32_t * op1_f1,uint32_t * op0_f1,uint32_t * oq0_f1,uint32_t * oq1_f1,uint32_t * oq2_f1)449 static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
450                                    uint32_t p1, uint32_t p0,
451                                    uint32_t q0, uint32_t q1,
452                                    uint32_t q2, uint32_t q3,
453                                    uint32_t *op2_f1,
454                                    uint32_t *op1_f1, uint32_t *op0_f1,
455                                    uint32_t *oq0_f1, uint32_t *oq1_f1,
456                                    uint32_t *oq2_f1) {
457   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
458   uint32_t  res_op2, res_op1, res_op0;
459   uint32_t  res_oq0, res_oq1, res_oq2;
460   uint32_t  tmp;
461   uint32_t  add_p210_q012;
462   uint32_t  u32Four = 0x00040004;
463 
464   /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
465   /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
466   /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
467   /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
468   /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
469   /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */
470 
471   __asm__ __volatile__ (
472       "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
473       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
474       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
475       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
476       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
477       "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"
478 
479       "shll.ph    %[tmp],            %[p3],             1                 \n\t"
480       "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
481       "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
482       "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
483       "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
484       "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
485       "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
486       "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
487       "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
488       "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
489       "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
490       "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
491       "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
492       "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
493       "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
494       "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
495       "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
496       "shll.ph    %[tmp],            %[q3],             1                 \n\t"
497       "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
498       "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
499       "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
500       "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
501       "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
502       "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
503       "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
504       "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
505       "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
506       "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
507       "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
508       "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"
509 
510       : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
511         [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
512         [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
513         [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
514       : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
515         [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
516         [u32Four] "r" (u32Four)
517   );
518 
519   *op2_f1 = res_op2;
520   *op1_f1 = res_op1;
521   *op0_f1 = res_op0;
522   *oq0_f1 = res_oq0;
523   *oq1_f1 = res_oq1;
524   *oq2_f1 = res_oq2;
525 }
526 
wide_mbfilter_dspr2(uint32_t * op7,uint32_t * op6,uint32_t * op5,uint32_t * op4,uint32_t * op3,uint32_t * op2,uint32_t * op1,uint32_t * op0,uint32_t * oq0,uint32_t * oq1,uint32_t * oq2,uint32_t * oq3,uint32_t * oq4,uint32_t * oq5,uint32_t * oq6,uint32_t * oq7)527 static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
528                                        uint32_t *op5, uint32_t *op4,
529                                        uint32_t *op3, uint32_t *op2,
530                                        uint32_t *op1, uint32_t *op0,
531                                        uint32_t *oq0, uint32_t *oq1,
532                                        uint32_t *oq2, uint32_t *oq3,
533                                        uint32_t *oq4, uint32_t *oq5,
534                                        uint32_t *oq6, uint32_t *oq7) {
535   const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
536   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
537   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
538   const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
539   uint32_t       res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
540   uint32_t       res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
541   uint32_t       tmp;
542   uint32_t       add_p6toq6;
543   uint32_t       u32Eight = 0x00080008;
544 
545   __asm__ __volatile__ (
546       /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
547          which is used most of the time */
548       "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
549       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
550       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
551       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
552       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
553       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
554       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
555       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
556       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
557       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
558       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
559       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
560       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
561       "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
562 
563       : [add_p6toq6] "=&r" (add_p6toq6)
564       : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
565         [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
566         [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
567         [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
568         [u32Eight] "r" (u32Eight)
569   );
570 
571   __asm__ __volatile__ (
572       /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
573                                    p3 + p2 + p1 + p0 + q0, 4) */
574       "shll.ph       %[tmp],            %[p7],            3               \n\t"
575       "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
576       "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
577       "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
578       "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
579       "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
580       "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
581       "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
582       "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
583       "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
584       "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"
585 
586       /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
587                                    p2 + p1 + p0 + q0 + q1, 4) */
588       "shll.ph       %[tmp],            %[p7],            2               \n\t"
589       "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
590       "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
591       "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
592       "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
593       "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
594       "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
595       "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
596       "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
597       "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
598       "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"
599 
600       /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
601                                    p1 + p0 + q0 + q1 + q2, 4) */
602       "shll.ph       %[tmp],            %[p7],            2               \n\t"
603       "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
604       "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
605       "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
606       "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
607       "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
608       "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
609       "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
610       "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"
611 
612       /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
613                                    p1 + p0 + q0 + q1 + q2 + q3, 4) */
614       "shll.ph       %[tmp],            %[p7],            2               \n\t"
615       "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
616       "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
617       "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
618       "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
619       "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
620       "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"
621 
622       /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
623                                    p0 + q0 + q1 + q2 + q3 + q4, 4) */
624       "shll.ph       %[tmp],            %[p7],            1               \n\t"
625       "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
626       "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
627       "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
628       "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
629       "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
630       "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"
631 
632       /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
633                                    p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
634       "shll.ph       %[tmp],            %[p7],            1               \n\t"
635       "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
636       "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
637       "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
638       "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"
639 
640       /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
641                                   q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
642       "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
643       "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
644       "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"
645 
646       : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
647         [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
648         [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
649         [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
650       : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
651         [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
652         [q2] "r" (q2), [q1] "r" (q1),
653         [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
654         [add_p6toq6] "r" (add_p6toq6)
655   );
656 
657   *op6 = res_op6;
658   *op5 = res_op5;
659   *op4 = res_op4;
660   *op3 = res_op3;
661   *op2 = res_op2;
662   *op1 = res_op1;
663   *op0 = res_op0;
664 
665   __asm__ __volatile__ (
666       /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
667                                    q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
668       "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
669       "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
670       "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"
671 
672       /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
673                                    q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
674       "shll.ph       %[tmp],            %[q7],            1               \n\t"
675       "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
676       "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
677       "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
678       "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"
679 
680       /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
681                                    q3 + q4 + q5 + q6 + q7 * 3, 4) */
682       "shll.ph       %[tmp],            %[q7],            1               \n\t"
683       "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
684       "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
685       "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
686       "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
687       "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
688       "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"
689 
690       /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
691                                    q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
692       "shll.ph       %[tmp],            %[q7],            2               \n\t"
693       "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
694       "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
695       "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
696       "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
697       "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
698       "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"
699 
700       /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
701                                    q4 * 2 + q5 + q6 + q7 * 5, 4) */
702       "shll.ph       %[tmp],            %[q7],            2               \n\t"
703       "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
704       "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
705       "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
706       "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
707       "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
708       "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
709       "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
710       "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"
711 
712       /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
713                                    q5 * 2 + q6 + q7 * 6, 4) */
714       "shll.ph       %[tmp],            %[q7],            2               \n\t"
715       "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
716       "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
717       "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
718       "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
719       "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
720       "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
721       "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
722       "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
723       "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
724       "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"
725 
726       /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
727                                    q4 + q5 + q6 * 2 + q7 * 7, 4) */
728       "shll.ph       %[tmp],            %[q7],            3               \n\t"
729       "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
730       "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
731       "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
732       "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
733       "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
734       "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
735       "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
736       "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
737       "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
738       "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"
739 
740       : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
741         [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
742         [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
743         [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
744       : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
745         [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
746         [p1] "r" (p1), [p2] "r" (p2),
747         [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
748         [add_p6toq6] "r" (add_p6toq6)
749   );
750 
751   *oq0 = res_oq0;
752   *oq1 = res_oq1;
753   *oq2 = res_oq2;
754   *oq3 = res_oq3;
755   *oq4 = res_oq4;
756   *oq5 = res_oq5;
757   *oq6 = res_oq6;
758 }
759 #endif  // #if HAVE_DSPR2
760 #ifdef __cplusplus
761 }  // extern "C"
762 #endif
763 
764 #endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
765