/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"

#ifdef __cplusplus
extern "C" {
#endif

#if HAVE_DSPR2
/* inputs & outputs are quad-byte vectors */
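/* Each helper below processes four pixels at a time: the four bytes packed
 * in one uint32_t travel through the MIPS DSPr2 paired-halfword (.ph)
 * instructions as two halfword lanes per register pair.  As a rough scalar
 * reference (a sketch for one pixel column, not code used by this file),
 * filter_dspr2() computes approximately:
 *
 *   filter  = clamp8(ps1 - qs1) & hev;
 *   filter  = clamp8(filter + 3 * (qs0 - ps0)) & mask;
 *   Filter1 = clamp8(filter + 4) >> 3;
 *   Filter2 = clamp8(filter + 3) >> 3;
 *   ps0     = clamp8(ps0 + Filter2);
 *   qs0     = clamp8(qs0 - Filter1);
 *   filter  = ((Filter1 + 1) >> 1) & ~hev;
 *   ps1     = clamp8(ps1 + filter);
 *   qs1     = clamp8(qs1 - filter);
 *
 * where clamp8() saturates to [-128, 127] and ps1, ps0, qs0, qs1 are the
 * 0x80-biased (signed) pixel values.
 */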
static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
                                uint32_t *ps1, uint32_t *ps0,
                                uint32_t *qs0, uint32_t *qs1) {
  int32_t vpx_filter_l, vpx_filter_r;
  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
  int32_t subr_r, subr_l;
  uint32_t t1, t2, HWM, t3;
  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
  int32_t vps1, vps0, vqs0, vqs1;
  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
  uint32_t N128;

  N128 = 0x80808080;
  t1 = 0x03000300;
  t2 = 0x04000400;
  t3 = 0x01000100;
  HWM = 0xFF00FF00;

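  /* XOR with 0x80 in every byte turns unsigned pixels into signed,
     zero-centered values (0x00 -> -128, 0xFF -> +127), so the saturating
     signed halfword arithmetic below reproduces the scalar clamp. */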
  vps0 = (*ps0) ^ N128;
  vps1 = (*ps1) ^ N128;
  vqs0 = (*qs0) ^ N128;
  vqs1 = (*qs1) ^ N128;

  /* use halfword pairs instead of quad-bytes for accuracy: each byte is
     placed in the upper half of a 16-bit lane, so saturating halfword
     arithmetic saturates exactly at the byte boundaries */
  vps0_l = vps0 & HWM;
  vps0_r = vps0 << 8;
  vps0_r = vps0_r & HWM;

  vps1_l = vps1 & HWM;
  vps1_r = vps1 << 8;
  vps1_r = vps1_r & HWM;

  vqs0_l = vqs0 & HWM;
  vqs0_r = vqs0 << 8;
  vqs0_r = vqs0_r & HWM;

  vqs1_l = vqs1 & HWM;
  vqs1_r = vqs1 << 8;
  vqs1_r = vqs1_r & HWM;

  mask_l = mask & HWM;
  mask_r = mask << 8;
  mask_r = mask_r & HWM;

  hev_l = hev & HWM;
  hev_r = hev << 8;
  hev_r = hev_r & HWM;

  __asm__ __volatile__ (
      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]     \n\t"
      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]     \n\t"

      /* qs0 - ps0 */
      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]     \n\t"
      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]     \n\t"

      /* vpx_filter &= hev; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]      \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]      \n\t"

      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]     \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]     \n\t"
      "xor          %[invhev_l],     %[hev_l],        %[HWM]        \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]     \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]     \n\t"
      "xor          %[invhev_r],     %[hev_r],        %[HWM]        \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]     \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]     \n\t"

      /* vpx_filter &= mask; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]     \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]     \n\t"

      : [vpx_filter_l] "=&r" (vpx_filter_l),
        [vpx_filter_r] "=&r" (vpx_filter_r),
        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
        [HWM] "r" (HWM)
  );

  /* save bottom 3 bits so that we round one side +4 and the other +3 */
  __asm__ __volatile__ (
      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >> 3; */
      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]         \n\t"
      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]         \n\t"

      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >> 3; */
      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]         \n\t"
      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]         \n\t"
      "shra.ph      %[Filter1_r],    %[Filter1_r],    3             \n\t"
      "shra.ph      %[Filter1_l],    %[Filter1_l],    3             \n\t"

      "shra.ph      %[Filter2_l],    %[Filter2_l],    3             \n\t"
      "shra.ph      %[Filter2_r],    %[Filter2_r],    3             \n\t"

      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]        \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]        \n\t"

      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]  \n\t"
      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]  \n\t"

      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]  \n\t"
      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]  \n\t"

      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
  );

  __asm__ __volatile__ (
      /* (vpx_filter += 1) >>= 1 */
      "addqh.ph    %[Filter1_l],    %[Filter1_l],    %[t3]          \n\t"
      "addqh.ph    %[Filter1_r],    %[Filter1_r],    %[t3]          \n\t"

      /* vpx_filter &= ~hev; */
      "and         %[Filter1_l],    %[Filter1_l],    %[invhev_l]    \n\t"
      "and         %[Filter1_r],    %[Filter1_r],    %[invhev_r]    \n\t"

      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
      "addq_s.ph   %[vps1_l],       %[vps1_l],       %[Filter1_l]   \n\t"
      "addq_s.ph   %[vps1_r],       %[vps1_r],       %[Filter1_r]   \n\t"

      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
      "subq_s.ph   %[vqs1_l],       %[vqs1_l],       %[Filter1_l]   \n\t"
      "subq_s.ph   %[vqs1_r],       %[vqs1_r],       %[Filter1_r]   \n\t"

      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
  );

  /* Create quad-bytes from halfword pairs */
  vqs0_l = vqs0_l & HWM;
  vqs1_l = vqs1_l & HWM;
  vps0_l = vps0_l & HWM;
  vps1_l = vps1_l & HWM;

  __asm__ __volatile__ (
      "shrl.ph    %[vqs0_r],    %[vqs0_r],    8    \n\t"
      "shrl.ph    %[vps0_r],    %[vps0_r],    8    \n\t"
      "shrl.ph    %[vqs1_r],    %[vqs1_r],    8    \n\t"
      "shrl.ph    %[vps1_r],    %[vps1_r],    8    \n\t"

      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
      :
  );

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
  vps0 = vps0_l | vps0_r;
  vps1 = vps1_l | vps1_r;

  *ps0 = vps0 ^ N128;
  *ps1 = vps1 ^ N128;
  *qs0 = vqs0 ^ N128;
  *qs1 = vqs1 ^ N128;
}

static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
                                 uint32_t ps1, uint32_t ps0,
                                 uint32_t qs0, uint32_t qs1,
                                 uint32_t *p1_f0, uint32_t *p0_f0,
                                 uint32_t *q0_f0, uint32_t *q1_f0) {
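  /* Same computation as filter_dspr2(), but the inputs are passed by value
     and the filtered pixels are written to the separate *_f0 outputs so a
     caller can choose between these and the flat-filter results. */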
  int32_t vpx_filter_l, vpx_filter_r;
  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
  int32_t subr_r, subr_l;
  uint32_t t1, t2, HWM, t3;
  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
  int32_t vps1, vps0, vqs0, vqs1;
  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
  uint32_t N128;

  N128 = 0x80808080;
  t1 = 0x03000300;
  t2 = 0x04000400;
  t3 = 0x01000100;
  HWM = 0xFF00FF00;

  vps0 = (ps0) ^ N128;
  vps1 = (ps1) ^ N128;
  vqs0 = (qs0) ^ N128;
  vqs1 = (qs1) ^ N128;

  /* use halfword pairs instead of quad-bytes for accuracy */
  vps0_l = vps0 & HWM;
  vps0_r = vps0 << 8;
  vps0_r = vps0_r & HWM;

  vps1_l = vps1 & HWM;
  vps1_r = vps1 << 8;
  vps1_r = vps1_r & HWM;

  vqs0_l = vqs0 & HWM;
  vqs0_r = vqs0 << 8;
  vqs0_r = vqs0_r & HWM;

  vqs1_l = vqs1 & HWM;
  vqs1_r = vqs1 << 8;
  vqs1_r = vqs1_r & HWM;

  mask_l = mask & HWM;
  mask_r = mask << 8;
  mask_r = mask_r & HWM;

  hev_l = hev & HWM;
  hev_r = hev << 8;
  hev_r = hev_r & HWM;

  __asm__ __volatile__ (
      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]     \n\t"
      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]     \n\t"

      /* qs0 - ps0 */
      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]     \n\t"
      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]     \n\t"

      /* vpx_filter &= hev; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]      \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]      \n\t"

      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]     \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]     \n\t"
      "xor          %[invhev_l],     %[hev_l],        %[HWM]        \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]     \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]     \n\t"
      "xor          %[invhev_r],     %[hev_r],        %[HWM]        \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]     \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]     \n\t"

      /* vpx_filter &= mask; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]     \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]     \n\t"

      : [vpx_filter_l] "=&r" (vpx_filter_l),
        [vpx_filter_r] "=&r" (vpx_filter_r),
        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
  );

  /* save bottom 3 bits so that we round one side +4 and the other +3 */
  __asm__ __volatile__ (
      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >> 3; */
      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]         \n\t"
      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]         \n\t"

      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >> 3; */
      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]         \n\t"
      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]         \n\t"
      "shra.ph      %[Filter1_r],    %[Filter1_r],    3             \n\t"
      "shra.ph      %[Filter1_l],    %[Filter1_l],    3             \n\t"

      "shra.ph      %[Filter2_l],    %[Filter2_l],    3             \n\t"
      "shra.ph      %[Filter2_r],    %[Filter2_r],    3             \n\t"

      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]        \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]        \n\t"

      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]  \n\t"
      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]  \n\t"

      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]  \n\t"
      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]  \n\t"

      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
  );

  __asm__ __volatile__ (
      /* (vpx_filter += 1) >>= 1 */
      "addqh.ph    %[Filter1_l],    %[Filter1_l],    %[t3]          \n\t"
      "addqh.ph    %[Filter1_r],    %[Filter1_r],    %[t3]          \n\t"

      /* vpx_filter &= ~hev; */
      "and         %[Filter1_l],    %[Filter1_l],    %[invhev_l]    \n\t"
      "and         %[Filter1_r],    %[Filter1_r],    %[invhev_r]    \n\t"

      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
      "addq_s.ph   %[vps1_l],       %[vps1_l],       %[Filter1_l]   \n\t"
      "addq_s.ph   %[vps1_r],       %[vps1_r],       %[Filter1_r]   \n\t"

      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
      "subq_s.ph   %[vqs1_l],       %[vqs1_l],       %[Filter1_l]   \n\t"
      "subq_s.ph   %[vqs1_r],       %[vqs1_r],       %[Filter1_r]   \n\t"

      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
  );

  /* Create quad-bytes from halfword pairs */
  vqs0_l = vqs0_l & HWM;
  vqs1_l = vqs1_l & HWM;
  vps0_l = vps0_l & HWM;
  vps1_l = vps1_l & HWM;

  __asm__ __volatile__ (
      "shrl.ph    %[vqs0_r],    %[vqs0_r],    8    \n\t"
      "shrl.ph    %[vps0_r],    %[vps0_r],    8    \n\t"
      "shrl.ph    %[vqs1_r],    %[vqs1_r],    8    \n\t"
      "shrl.ph    %[vps1_r],    %[vps1_r],    8    \n\t"

      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
      :
  );

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
  vps0 = vps0_l | vps0_r;
  vps1 = vps1_l | vps1_r;

  *p0_f0 = vps0 ^ N128;
  *p1_f0 = vps1 ^ N128;
  *q0_f0 = vqs0 ^ N128;
  *q1_f0 = vqs1 ^ N128;
}

static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
                                  uint32_t *op1, uint32_t *op0,
                                  uint32_t *oq0, uint32_t *oq1,
                                  uint32_t *oq2, uint32_t *oq3) {
  /* use a 7-tap filter [1, 1, 1, 2, 1, 1, 1] for flat lines */
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
  uint32_t res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2;
  uint32_t tmp;
  uint32_t add_p210_q012;
  uint32_t u32Four = 0x00040004;
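  /* per-halfword rounding constant: ROUND_POWER_OF_TWO(x, 3) is
     (x + 4) >> 3, so folding u32Four into the running sum rounds both
     lanes at once */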

  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */

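  /* add_p210_q012 holds p2 + p1 + p0 + q0 + q1 + q2 + 4 (the shared taps
     plus rounding); each output adds its extra taps and subtracts the
     unwanted ones before the shift, e.g.
       res_op2 = (3 * p3 + p2 + add_p210_q012 - q1 - q2) >> 3
               = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3 */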
  __asm__ __volatile__ (
      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"

      "shll.ph    %[tmp],            %[p3],             1                \n\t"
      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
      "shll.ph    %[tmp],            %[q3],             1                \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"

      : [add_p210_q012] "=&r" (add_p210_q012),
        [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
        [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
        [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
        [res_oq2] "=&r" (res_oq2)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
        [u32Four] "r" (u32Four)
  );

  *op2 = res_op2;
  *op1 = res_op1;
  *op0 = res_op0;
  *oq0 = res_oq0;
  *oq1 = res_oq1;
  *oq2 = res_oq2;
}

static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
                                   uint32_t p1, uint32_t p0,
                                   uint32_t q0, uint32_t q1,
                                   uint32_t q2, uint32_t q3,
                                   uint32_t *op2_f1,
                                   uint32_t *op1_f1, uint32_t *op0_f1,
                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
                                   uint32_t *oq2_f1) {
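  /* Same 7-tap computation as mbfilter_dspr2(), with inputs passed by
     value and the results written to the separate *_f1 outputs. */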
  /* use a 7-tap filter [1, 1, 1, 2, 1, 1, 1] for flat lines */
  uint32_t res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2;
  uint32_t tmp;
  uint32_t add_p210_q012;
  uint32_t u32Four = 0x00040004;

  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */

  __asm__ __volatile__ (
      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"

      "shll.ph    %[tmp],            %[p3],             1                \n\t"
      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
      "shll.ph    %[tmp],            %[q3],             1                \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"

      : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
        [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
        [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
        [u32Four] "r" (u32Four)
  );

  *op2_f1 = res_op2;
  *op1_f1 = res_op1;
  *op0_f1 = res_op0;
  *oq0_f1 = res_oq0;
  *oq1_f1 = res_oq1;
  *oq2_f1 = res_oq2;
}

static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
                                       uint32_t *op5, uint32_t *op4,
                                       uint32_t *op3, uint32_t *op2,
                                       uint32_t *op1, uint32_t *op0,
                                       uint32_t *oq0, uint32_t *oq1,
                                       uint32_t *oq2, uint32_t *oq3,
                                       uint32_t *oq4, uint32_t *oq5,
                                       uint32_t *oq6, uint32_t *oq7) {
  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
  uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
  uint32_t tmp;
  uint32_t add_p6toq6;
  uint32_t u32Eight = 0x00080008;
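  /* per-halfword rounding constant for the 15-tap filter:
     ROUND_POWER_OF_TWO(x, 4) is (x + 8) >> 4 */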

  __asm__ __volatile__ (
      /* running sum of p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
         q6 plus the rounding constant, reused by every output below */
      "addu.ph    %[add_p6toq6],    %[p6],            %[p5]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[p4]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[p3]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[p2]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[p1]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[p0]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[q0]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[q1]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[q2]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[q3]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[q4]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[q5]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[q6]          \n\t"
      "addu.ph    %[add_p6toq6],    %[add_p6toq6],    %[u32Eight]    \n\t"

      : [add_p6toq6] "=&r" (add_p6toq6)
      : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
        [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
        [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
        [u32Eight] "r" (u32Eight)
  );

  __asm__ __volatile__ (
      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
                                   p3 + p2 + p1 + p0 + q0, 4) */
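      /* multiplies by small constants are built from shifts and adds:
         7 * p7 is (p7 << 3) - p7 here; 6 * p7 below is (p7 << 2) + p7 + p7 */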
574 "shll.ph %[tmp], %[p7], 3 \n\t"
575 "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
576 "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
577 "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
578 "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
579 "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
580 "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
581 "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
582 "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
583 "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
584 "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
585
586 /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
587 p2 + p1 + p0 + q0 + q1, 4) */
588 "shll.ph %[tmp], %[p7], 2 \n\t"
589 "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
590 "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
591 "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
592 "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
593 "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
594 "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
595 "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
596 "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
597 "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
598 "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
599
600 /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
601 p1 + p0 + q0 + q1 + q2, 4) */
602 "shll.ph %[tmp], %[p7], 2 \n\t"
603 "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
604 "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
605 "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
606 "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
607 "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
608 "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
609 "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
610 "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
611
612 /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
613 p1 + p0 + q0 + q1 + q2 + q3, 4) */
614 "shll.ph %[tmp], %[p7], 2 \n\t"
615 "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
616 "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
617 "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
618 "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
619 "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
620 "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
621
622 /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
623 p0 + q0 + q1 + q2 + q3 + q4, 4) */
624 "shll.ph %[tmp], %[p7], 1 \n\t"
625 "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
626 "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
627 "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
628 "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
629 "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
630 "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
631
632 /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
633 p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
634 "shll.ph %[tmp], %[p7], 1 \n\t"
635 "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
636 "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
637 "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
638 "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
639
640 /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
641 q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
642 "addu.ph %[res_op0], %[p7], %[p0] \n\t"
643 "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
644 "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
645
646 : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
647 [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
648 [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
649 [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
650 : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
651 [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
652 [q2] "r" (q2), [q1] "r" (q1),
653 [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
654 [add_p6toq6] "r" (add_p6toq6)
655 );

  *op6 = res_op6;
  *op5 = res_op5;
  *op4 = res_op4;
  *op3 = res_op3;
  *op2 = res_op2;
  *op1 = res_op1;
  *op0 = res_op0;

  __asm__ __volatile__ (
      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
      "addu.ph    %[res_oq0],    %[q7],        %[q0]           \n\t"
      "addu.ph    %[res_oq0],    %[res_oq0],   %[add_p6toq6]   \n\t"
      "shrl.ph    %[res_oq0],    %[res_oq0],   4               \n\t"

      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
      "shll.ph    %[tmp],        %[q7],        1               \n\t"
      "addu.ph    %[res_oq1],    %[tmp],       %[q1]           \n\t"
      "addu.ph    %[res_oq1],    %[res_oq1],   %[add_p6toq6]   \n\t"
      "subu.ph    %[res_oq1],    %[res_oq1],   %[p6]           \n\t"
      "shrl.ph    %[res_oq1],    %[res_oq1],   4               \n\t"

      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
      "shll.ph    %[tmp],        %[q7],        1               \n\t"
      "addu.ph    %[res_oq2],    %[tmp],       %[q7]           \n\t"
      "addu.ph    %[res_oq2],    %[res_oq2],   %[q2]           \n\t"
      "addu.ph    %[res_oq2],    %[res_oq2],   %[add_p6toq6]   \n\t"
      "subu.ph    %[res_oq2],    %[res_oq2],   %[p5]           \n\t"
      "subu.ph    %[res_oq2],    %[res_oq2],   %[p6]           \n\t"
      "shrl.ph    %[res_oq2],    %[res_oq2],   4               \n\t"

      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
      "shll.ph    %[tmp],        %[q7],        2               \n\t"
      "addu.ph    %[res_oq3],    %[tmp],       %[q3]           \n\t"
      "addu.ph    %[res_oq3],    %[res_oq3],   %[add_p6toq6]   \n\t"
      "subu.ph    %[res_oq3],    %[res_oq3],   %[p4]           \n\t"
      "subu.ph    %[res_oq3],    %[res_oq3],   %[p5]           \n\t"
      "subu.ph    %[res_oq3],    %[res_oq3],   %[p6]           \n\t"
      "shrl.ph    %[res_oq3],    %[res_oq3],   4               \n\t"

      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
      "shll.ph    %[tmp],        %[q7],        2               \n\t"
      "addu.ph    %[res_oq4],    %[tmp],       %[q7]           \n\t"
      "addu.ph    %[res_oq4],    %[res_oq4],   %[q4]           \n\t"
      "addu.ph    %[res_oq4],    %[res_oq4],   %[add_p6toq6]   \n\t"
      "subu.ph    %[res_oq4],    %[res_oq4],   %[p3]           \n\t"
      "subu.ph    %[res_oq4],    %[res_oq4],   %[p4]           \n\t"
      "subu.ph    %[res_oq4],    %[res_oq4],   %[p5]           \n\t"
      "subu.ph    %[res_oq4],    %[res_oq4],   %[p6]           \n\t"
      "shrl.ph    %[res_oq4],    %[res_oq4],   4               \n\t"

      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
                                   q5 * 2 + q6 + q7 * 6, 4) */
      "shll.ph    %[tmp],        %[q7],        2               \n\t"
      "addu.ph    %[res_oq5],    %[tmp],       %[q7]           \n\t"
      "addu.ph    %[res_oq5],    %[res_oq5],   %[q7]           \n\t"
      "addu.ph    %[res_oq5],    %[res_oq5],   %[q5]           \n\t"
      "addu.ph    %[res_oq5],    %[res_oq5],   %[add_p6toq6]   \n\t"
      "subu.ph    %[res_oq5],    %[res_oq5],   %[p2]           \n\t"
      "subu.ph    %[res_oq5],    %[res_oq5],   %[p3]           \n\t"
      "subu.ph    %[res_oq5],    %[res_oq5],   %[p4]           \n\t"
      "subu.ph    %[res_oq5],    %[res_oq5],   %[p5]           \n\t"
      "subu.ph    %[res_oq5],    %[res_oq5],   %[p6]           \n\t"
      "shrl.ph    %[res_oq5],    %[res_oq5],   4               \n\t"

      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
      "shll.ph    %[tmp],        %[q7],        3               \n\t"
      "subu.ph    %[res_oq6],    %[tmp],       %[q7]           \n\t"
      "addu.ph    %[res_oq6],    %[res_oq6],   %[q6]           \n\t"
      "addu.ph    %[res_oq6],    %[res_oq6],   %[add_p6toq6]   \n\t"
      "subu.ph    %[res_oq6],    %[res_oq6],   %[p1]           \n\t"
      "subu.ph    %[res_oq6],    %[res_oq6],   %[p2]           \n\t"
      "subu.ph    %[res_oq6],    %[res_oq6],   %[p3]           \n\t"
      "subu.ph    %[res_oq6],    %[res_oq6],   %[p4]           \n\t"
      "subu.ph    %[res_oq6],    %[res_oq6],   %[p5]           \n\t"
      "subu.ph    %[res_oq6],    %[res_oq6],   %[p6]           \n\t"
      "shrl.ph    %[res_oq6],    %[res_oq6],   4               \n\t"

      : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
        [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
        [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
        [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
      : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
        [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
        [p1] "r" (p1), [p2] "r" (p2),
        [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
        [add_p6toq6] "r" (add_p6toq6)
  );

  *oq0 = res_oq0;
  *oq1 = res_oq1;
  *oq2 = res_oq2;
  *oq3 = res_oq3;
  *oq4 = res_oq4;
  *oq5 = res_oq5;
  *oq6 = res_oq6;
}
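
/* Illustrative use (a sketch; the real callers live in the corresponding
 * DSPR2 loop-filter .c files): four horizontally adjacent pixels are packed
 * into each uint32_t, so one call filters four columns at once, e.g.
 *
 *   uint32_t p1, p0, q0, q1;
 *   // load four bytes per row around the edge into p1, p0, q0, q1
 *   filter_dspr2(mask, hev, &p1, &p0, &q0, &q1);
 *   // store the filtered rows back
 */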
#endif  // #if HAVE_DSPR2
#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_