1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stdlib.h>
12 #include "vp8_rtcd.h"
13 #include "vp8/common/onyxc_int.h"
14
15 #if HAVE_DSPR2
16 typedef unsigned char uc;
17
18 /* prefetch data for load */
19 inline void prefetch_load_lf(unsigned char *src) {
20 __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
21 }
22
23 /* prefetch data for store */
24 inline void prefetch_store_lf(unsigned char *dst) {
25 __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
26 }
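/* Note: "pref" hint 0 marks the line as data that is expected to be read,
 * hint 1 as data that is expected to be written; both are pure cache hints
 * and never change program results.
 */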
27
28 /* processing 4 pixels at the same time
29 * compute hev and mask in the same function
30 */
31 static __inline void vp8_filter_mask_vec_mips(
32 uint32_t limit, uint32_t flimit, uint32_t p1, uint32_t p0, uint32_t p3,
33 uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3,
34 uint32_t thresh, uint32_t *hev, uint32_t *mask) {
35 uint32_t c, r, r3, r_k;
36 uint32_t s1, s2, s3;
37 uint32_t ones = 0xFFFFFFFF;
38 uint32_t hev1;
39
40 __asm__ __volatile__(
41 /* mask |= (abs(p3 - p2) > limit) */
42 "subu_s.qb %[c], %[p3], %[p2] \n\t"
43 "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
44 "or %[r_k], %[r_k], %[c] \n\t"
45 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
46 "or %[r], $0, %[c] \n\t"
47
48 /* mask |= (abs(p2 - p1) > limit) */
49 "subu_s.qb %[c], %[p2], %[p1] \n\t"
50 "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
51 "or %[r_k], %[r_k], %[c] \n\t"
52 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
53 "or %[r], %[r], %[c] \n\t"
54
55 /* mask |= (abs(p1 - p0) > limit)
56 * hev |= (abs(p1 - p0) > thresh)
57 */
58 "subu_s.qb %[c], %[p1], %[p0] \n\t"
59 "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
60 "or %[r_k], %[r_k], %[c] \n\t"
61 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
62 "or %[r3], $0, %[c] \n\t"
63 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
64 "or %[r], %[r], %[c] \n\t"
65
66 /* mask |= (abs(q1 - q0) > limit)
67 * hev |= (abs(q1 - q0) > thresh)
68 */
69 "subu_s.qb %[c], %[q1], %[q0] \n\t"
70 "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
71 "or %[r_k], %[r_k], %[c] \n\t"
72 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
73 "or %[r3], %[r3], %[c] \n\t"
74 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
75 "or %[r], %[r], %[c] \n\t"
76
77 /* mask |= (abs(q2 - q1) > limit) */
78 "subu_s.qb %[c], %[q2], %[q1] \n\t"
79 "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
80 "or %[r_k], %[r_k], %[c] \n\t"
81 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
82 "or %[r], %[r], %[c] \n\t"
83 "sll %[r3], %[r3], 24 \n\t"
84
85 /* mask |= (abs(q3 - q2) > limit) */
86 "subu_s.qb %[c], %[q3], %[q2] \n\t"
87 "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
88 "or %[r_k], %[r_k], %[c] \n\t"
89 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
90 "or %[r], %[r], %[c] \n\t"
91
92 : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
93 : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
94 [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
95 [thresh] "r"(thresh));
96
97 __asm__ __volatile__(
98 /* abs(p0 - q0) */
99 "subu_s.qb %[c], %[p0], %[q0] \n\t"
100 "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
101 "wrdsp %[r3] \n\t"
102 "or %[s1], %[r_k], %[c] \n\t"
103
104 /* abs(p1 - q1) */
105 "subu_s.qb %[c], %[p1], %[q1] \n\t"
106 "addu_s.qb %[s3], %[s1], %[s1] \n\t"
107 "pick.qb %[hev1], %[ones], $0 \n\t"
108 "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
109 "or %[s2], %[r_k], %[c] \n\t"
110
111 /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
112 "shrl.qb %[s2], %[s2], 1 \n\t"
113 "addu_s.qb %[s1], %[s2], %[s3] \n\t"
114 "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
115 "or %[r], %[r], %[c] \n\t"
116 "sll %[r], %[r], 24 \n\t"
117
118 "wrdsp %[r] \n\t"
119 "pick.qb %[s2], $0, %[ones] \n\t"
120
121 : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
122 [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
123 : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
124 [ones] "r"(ones), [flimit] "r"(flimit));
125
126 *hev = hev1;
127 *mask = s2;
128 }
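/* For reference, the per-column test that the vectorized routine above
 * evaluates for four columns at once corresponds to the scalar sketch below
 * (illustrative only; the helper name and plain-int types are hypothetical
 * and not part of this file):
 *
 *   static int lf_needs_filtering(int limit, int flimit, int thresh,
 *                                 int p3, int p2, int p1, int p0,
 *                                 int q0, int q1, int q2, int q3,
 *                                 int *hev) {
 *     int over = 0;
 *     over |= (abs(p3 - p2) > limit);
 *     over |= (abs(p2 - p1) > limit);
 *     over |= (abs(p1 - p0) > limit);
 *     over |= (abs(q1 - q0) > limit);
 *     over |= (abs(q2 - q1) > limit);
 *     over |= (abs(q3 - q2) > limit);
 *     over |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit);
 *     *hev = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh);
 *     return !over;
 *   }
 *
 * The DSPR2 code forms each absolute difference with a pair of saturating
 * byte subtractions (subu_s.qb + or), accumulates the per-byte results of
 * cmpgu.lt.qb, and moves them into the DSPControl ccond field (sll by 24 +
 * wrdsp) so that pick.qb can expand them into 0x00/0xFF byte lanes: *mask
 * ends up 0xFF in every byte whose column should be filtered, and *hev is
 * 0xFF wherever the high edge-variance threshold was exceeded.
 */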
129
130 /* inputs & outputs are quad-byte vectors */
131 static __inline void vp8_filter_mips(uint32_t mask, uint32_t hev, uint32_t *ps1,
132 uint32_t *ps0, uint32_t *qs0,
133 uint32_t *qs1) {
134 int32_t vp8_filter_l, vp8_filter_r;
135 int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
136 int32_t subr_r, subr_l;
137 uint32_t t1, t2, HWM, t3;
138 uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
139
140 int32_t vps1, vps0, vqs0, vqs1;
141 int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
142 uint32_t N128;
143
144 N128 = 0x80808080;
145 t1 = 0x03000300;
146 t2 = 0x04000400;
147 t3 = 0x01000100;
148 HWM = 0xFF00FF00;
149
150 vps0 = (*ps0) ^ N128;
151 vps1 = (*ps1) ^ N128;
152 vqs0 = (*qs0) ^ N128;
153 vqs1 = (*qs1) ^ N128;
154
155 /* use halfword pairs instead of quad-bytes to preserve precision */
156 vps0_l = vps0 & HWM;
157 vps0_r = vps0 << 8;
158 vps0_r = vps0_r & HWM;
159
160 vps1_l = vps1 & HWM;
161 vps1_r = vps1 << 8;
162 vps1_r = vps1_r & HWM;
163
164 vqs0_l = vqs0 & HWM;
165 vqs0_r = vqs0 << 8;
166 vqs0_r = vqs0_r & HWM;
167
168 vqs1_l = vqs1 & HWM;
169 vqs1_r = vqs1 << 8;
170 vqs1_r = vqs1_r & HWM;
171
172 mask_l = mask & HWM;
173 mask_r = mask << 8;
174 mask_r = mask_r & HWM;
175
176 hev_l = hev & HWM;
177 hev_r = hev << 8;
178 hev_r = hev_r & HWM;
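/* Layout used by the halfword-pair arithmetic below: for a packed word
 * x = [b3 b2 b1 b0],
 *
 *   x_l =  x       & 0xFF00FF00   ->  [b3 00 b1 00]
 *   x_r = (x << 8) & 0xFF00FF00   ->  [b2 00 b0 00]
 *
 * so every pixel sits in the signed high byte of a 16-bit lane.  The
 * saturating halfword instructions (addq_s.ph / subq_s.ph) then clamp that
 * byte to the signed 8-bit range, while shra.ph can shift fraction bits into
 * the otherwise unused low byte instead of losing them.  The lanes are
 * re-interleaved into quad-bytes at the end of the function with
 * "and HWM", "shrl.ph ..., 8" and "or".
 */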
179
180 __asm__ __volatile__(
181 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
182 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
183 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
184
185 /* qs0 - ps0 */
186 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
187 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
188
189 /* vp8_filter &= hev; */
190 "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t"
191 "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t"
192
193 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
194 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
195 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
196 "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
197 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
198 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
199 "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
200 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
201 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
202
203 /* vp8_filter &= mask; */
204 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
205 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
206
207 : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=&r"(vp8_filter_r),
208 [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
209 [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
210
211 : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
212 [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
213 [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
214 [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
215 [HWM] "r"(HWM));
216
217 /* save bottom 3 bits so that we round one side +4 and the other +3 */
218 __asm__ __volatile__(
219 /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; */
220 "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t"
221 "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t"
222
223 /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; */
224 "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t"
225 "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t"
226 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
227 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
228
229 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
230 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
231
232 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
233 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
234
235 /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
236 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
237 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
238
239 /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
240 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
241 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
242
243 : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
244 [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
245 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
246 [vqs0_r] "+r"(vqs0_r)
247
248 : [t1] "r"(t1), [t2] "r"(t2), [vp8_filter_l] "r"(vp8_filter_l),
249 [vp8_filter_r] "r"(vp8_filter_r), [HWM] "r"(HWM));
250
251 __asm__ __volatile__(
252 /* (vp8_filter += 1) >>= 1 */
253 "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
254 "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
255
256 /* vp8_filter &= ~hev; */
257 "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
258 "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
259
260 /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
261 "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
262 "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
263
264 /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
265 "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
266 "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
267
268 : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
269 [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
270 [vqs1_r] "+r"(vqs1_r)
271
272 : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
273
274 /* Create quad-bytes from halfword pairs */
275 vqs0_l = vqs0_l & HWM;
276 vqs1_l = vqs1_l & HWM;
277 vps0_l = vps0_l & HWM;
278 vps1_l = vps1_l & HWM;
279
280 __asm__ __volatile__(
281 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
282 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
283 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
284 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
285
286 : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
287 [vqs0_r] "+r"(vqs0_r)
288 :);
289
290 vqs0 = vqs0_l | vqs0_r;
291 vqs1 = vqs1_l | vqs1_r;
292 vps0 = vps0_l | vps0_r;
293 vps1 = vps1_l | vps1_r;
294
295 *ps0 = vps0 ^ N128;
296 *ps1 = vps1 ^ N128;
297 *qs0 = vqs0 ^ N128;
298 *qs1 = vqs1 ^ N128;
299 }
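/* Scalar equivalent of the routine above, collected from the comments in the
 * asm blocks (an illustrative sketch for a single pixel; the code above
 * applies it to four packed pixels, and clamp() stands for signed 8-bit
 * saturation on values already biased by the ^ 0x80 above):
 *
 *   f  = clamp(ps1 - qs1);
 *   f &= hev;
 *   f  = clamp(f + 3 * (qs0 - ps0));
 *   f &= mask;
 *   Filter1 = clamp(f + 4) >> 3;
 *   Filter2 = clamp(f + 3) >> 3;
 *   qs0 = clamp(qs0 - Filter1);
 *   ps0 = clamp(ps0 + Filter2);
 *   f   = (Filter1 + 1) >> 1;
 *   f  &= ~hev;
 *   qs1 = clamp(qs1 - f);
 *   ps1 = clamp(ps1 + f);
 *
 * The +4/+3 split is what "save bottom 3 bits" refers to: the two sides of
 * the edge are rounded in opposite directions so the net step across the
 * boundary stays unbiased.
 */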
300
301 void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p,
302 unsigned int flimit,
303 unsigned int limit,
304 unsigned int thresh, int count) {
305 uint32_t mask;
306 uint32_t hev;
307 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
308 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
309 (void)count;
310
311 mask = 0;
312 hev = 0;
313 p1 = 0;
314 p2 = 0;
315 p3 = 0;
316 p4 = 0;
317
318 /* prefetch data for store */
319 prefetch_store_lf(s);
320
321 /* loop filter designed to work using chars so that we can make maximum use
322 * of 8 bit simd instructions.
323 */
324
325 sm1 = s - (p << 2);
326 s0 = s - p - p - p;
327 s1 = s - p - p;
328 s2 = s - p;
329 s3 = s;
330 s4 = s + p;
331 s5 = s + p + p;
332 s6 = s + p + p + p;
333
334 /* load quad-byte vectors
335 * memory is 4 byte aligned
336 */
337 p1 = *((uint32_t *)(s1));
338 p2 = *((uint32_t *)(s2));
339 p3 = *((uint32_t *)(s3));
340 p4 = *((uint32_t *)(s4));
341
342 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
343 * mask will be zero and filtering is not needed
344 */
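/* (p1..p4 hold the two rows on each side of the edge for four columns; if
 * both pairs match bytewise, the filter delta is zero for all four columns,
 * so the remaining loads and the mask computation can be skipped.)
 */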
345 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
346 pm1 = *((uint32_t *)(sm1));
347 p0 = *((uint32_t *)(s0));
348 p5 = *((uint32_t *)(s5));
349 p6 = *((uint32_t *)(s6));
350
351 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
352 thresh, &hev, &mask);
353
354 /* if mask == 0, filtering is not needed */
355 if (mask) {
356 /* filtering */
357 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
358
359 /* unpack processed 4x4 neighborhood */
360 *((uint32_t *)s1) = p1;
361 *((uint32_t *)s2) = p2;
362 *((uint32_t *)s3) = p3;
363 *((uint32_t *)s4) = p4;
364 }
365 }
366
367 sm1 += 4;
368 s0 += 4;
369 s1 += 4;
370 s2 += 4;
371 s3 += 4;
372 s4 += 4;
373 s5 += 4;
374 s6 += 4;
375
376 /* load quad-byte vectors
377 * memory is 4 byte aligned
378 */
379 p1 = *((uint32_t *)(s1));
380 p2 = *((uint32_t *)(s2));
381 p3 = *((uint32_t *)(s3));
382 p4 = *((uint32_t *)(s4));
383
384 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
385 * mask will be zero and filtering is not needed
386 */
387 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
388 pm1 = *((uint32_t *)(sm1));
389 p0 = *((uint32_t *)(s0));
390 p5 = *((uint32_t *)(s5));
391 p6 = *((uint32_t *)(s6));
392
393 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
394 thresh, &hev, &mask);
395
396 /* if mask == 0, filtering is not needed */
397 if (mask) {
398 /* filtering */
399 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
400
401 /* unpack processed 4x4 neighborhood */
402 *((uint32_t *)s1) = p1;
403 *((uint32_t *)s2) = p2;
404 *((uint32_t *)s3) = p3;
405 *((uint32_t *)s4) = p4;
406 }
407 }
408
409 sm1 += 4;
410 s0 += 4;
411 s1 += 4;
412 s2 += 4;
413 s3 += 4;
414 s4 += 4;
415 s5 += 4;
416 s6 += 4;
417
418 /* load quad-byte vectors
419 * memory is 4 byte aligned
420 */
421 p1 = *((uint32_t *)(s1));
422 p2 = *((uint32_t *)(s2));
423 p3 = *((uint32_t *)(s3));
424 p4 = *((uint32_t *)(s4));
425
426 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
427 * mask will be zero and filtering is not needed
428 */
429 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
430 pm1 = *((uint32_t *)(sm1));
431 p0 = *((uint32_t *)(s0));
432 p5 = *((uint32_t *)(s5));
433 p6 = *((uint32_t *)(s6));
434
435 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
436 thresh, &hev, &mask);
437
438 /* if mask == 0, filtering is not needed */
439 if (mask) {
440 /* filtering */
441 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
442
443 /* unpack processed 4x4 neighborhood */
444 *((uint32_t *)s1) = p1;
445 *((uint32_t *)s2) = p2;
446 *((uint32_t *)s3) = p3;
447 *((uint32_t *)s4) = p4;
448 }
449 }
450
451 sm1 += 4;
452 s0 += 4;
453 s1 += 4;
454 s2 += 4;
455 s3 += 4;
456 s4 += 4;
457 s5 += 4;
458 s6 += 4;
459
460 /* load quad-byte vectors
461 * memory is 4 byte aligned
462 */
463 p1 = *((uint32_t *)(s1));
464 p2 = *((uint32_t *)(s2));
465 p3 = *((uint32_t *)(s3));
466 p4 = *((uint32_t *)(s4));
467
468 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
469 * mask will be zero and filtering is not needed
470 */
471 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
472 pm1 = *((uint32_t *)(sm1));
473 p0 = *((uint32_t *)(s0));
474 p5 = *((uint32_t *)(s5));
475 p6 = *((uint32_t *)(s6));
476
477 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
478 thresh, &hev, &mask);
479
480 /* if mask == 0, filtering is not needed */
481 if (mask) {
482 /* filtering */
483 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
484
485 /* unpack processed 4x4 neighborhood */
486 *((uint32_t *)s1) = p1;
487 *((uint32_t *)s2) = p2;
488 *((uint32_t *)s3) = p3;
489 *((uint32_t *)s4) = p4;
490 }
491 }
492 }
493
494 void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
495 unsigned int flimit,
496 unsigned int limit,
497 unsigned int thresh, int count) {
498 uint32_t mask;
499 uint32_t hev;
500 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
501 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
502 (void)count;
503
504 mask = 0;
505 hev = 0;
506 p1 = 0;
507 p2 = 0;
508 p3 = 0;
509 p4 = 0;
510
511 /* loop filter designed to work using chars so that we can make maximum use
512 * of 8 bit simd instructions.
513 */
514
515 sm1 = s - (p << 2);
516 s0 = s - p - p - p;
517 s1 = s - p - p;
518 s2 = s - p;
519 s3 = s;
520 s4 = s + p;
521 s5 = s + p + p;
522 s6 = s + p + p + p;
523
524 /* load quad-byte vectors
525 * memory is 4 byte aligned
526 */
527 p1 = *((uint32_t *)(s1));
528 p2 = *((uint32_t *)(s2));
529 p3 = *((uint32_t *)(s3));
530 p4 = *((uint32_t *)(s4));
531
532 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
533 * mask will be zero and filtering is not needed
534 */
535 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
536 pm1 = *((uint32_t *)(sm1));
537 p0 = *((uint32_t *)(s0));
538 p5 = *((uint32_t *)(s5));
539 p6 = *((uint32_t *)(s6));
540
541 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
542 thresh, &hev, &mask);
543
544 /* if mask == 0, filtering is not needed */
545 if (mask) {
546 /* filtering */
547 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
548
549 /* unpack processed 4x4 neighborhood */
550 *((uint32_t *)s1) = p1;
551 *((uint32_t *)s2) = p2;
552 *((uint32_t *)s3) = p3;
553 *((uint32_t *)s4) = p4;
554 }
555 }
556
557 sm1 += 4;
558 s0 += 4;
559 s1 += 4;
560 s2 += 4;
561 s3 += 4;
562 s4 += 4;
563 s5 += 4;
564 s6 += 4;
565
566 /* load quad-byte vectors
567 * memory is 4 byte aligned
568 */
569 p1 = *((uint32_t *)(s1));
570 p2 = *((uint32_t *)(s2));
571 p3 = *((uint32_t *)(s3));
572 p4 = *((uint32_t *)(s4));
573
574 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
575 * mask will be zero and filtering is not needed
576 */
577 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
578 pm1 = *((uint32_t *)(sm1));
579 p0 = *((uint32_t *)(s0));
580 p5 = *((uint32_t *)(s5));
581 p6 = *((uint32_t *)(s6));
582
583 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
584 thresh, &hev, &mask);
585
586 /* if mask == 0, filtering is not needed */
587 if (mask) {
588 /* filtering */
589 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
590
591 /* unpack processed 4x4 neighborhood */
592 *((uint32_t *)s1) = p1;
593 *((uint32_t *)s2) = p2;
594 *((uint32_t *)s3) = p3;
595 *((uint32_t *)s4) = p4;
596 }
597 }
598 }
599
600 void vp8_loop_filter_vertical_edge_mips(unsigned char *s, int p,
601 const unsigned int flimit,
602 const unsigned int limit,
603 const unsigned int thresh, int count) {
604 int i;
605 uint32_t mask, hev;
606 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
607 unsigned char *s1, *s2, *s3, *s4;
608 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
609
610 hev = 0;
611 mask = 0;
612 i = 0;
613 pm1 = 0;
614 p0 = 0;
615 p1 = 0;
616 p2 = 0;
617 p3 = 0;
618 p4 = 0;
619 p5 = 0;
620 p6 = 0;
621
622 /* loop filter designed to work using chars so that we can make maximum use
623 * of 8 bit simd instructions.
624 */
625
626 /* apply filter on 4 pixels at the same time */
627 do {
628 /* prefetch data for store */
629 prefetch_store_lf(s + p);
630
631 s1 = s;
632 s2 = s + p;
633 s3 = s2 + p;
634 s4 = s3 + p;
635 s = s4 + p;
636
637 /* load quad-byte vectors
638 * memory is 4 byte aligned
639 */
640 p2 = *((uint32_t *)(s1 - 4));
641 p6 = *((uint32_t *)(s1));
642 p1 = *((uint32_t *)(s2 - 4));
643 p5 = *((uint32_t *)(s2));
644 p0 = *((uint32_t *)(s3 - 4));
645 p4 = *((uint32_t *)(s3));
646 pm1 = *((uint32_t *)(s4 - 4));
647 p3 = *((uint32_t *)(s4));
648
649 /* transpose pm1, p0, p1, p2 */
650 __asm__ __volatile__(
651 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
652 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
653 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
654 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
655
656 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
657 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
658 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
659 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
660
661 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
662 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
663 "append %[p1], %[sec3], 16 \n\t"
664 "append %[pm1], %[sec4], 16 \n\t"
665
666 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
667 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
668 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
669 :);
670
671 /* transpose p3, p4, p5, p6 */
672 __asm__ __volatile__(
673 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
674 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
675 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
676 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
677
678 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
679 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
680 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
681 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
682
683 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
684 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
685 "append %[p5], %[sec3], 16 \n\t"
686 "append %[p3], %[sec4], 16 \n\t"
687
688 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
689 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
690 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
691 :);
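/* After the two transposes above, each of pm1, p0, p1, p2 and p3..p6 holds
 * one column of the 4x8 block just loaded: the four bytes of a register are
 * the pixels at one horizontal offset from the vertical edge in rows s1..s4.
 * That matches the "one register = four pixels at one tap position" layout
 * used by the horizontal code, so the shared mask and filter routines are
 * reused unchanged; the filtered columns are then written back byte by byte
 * (sb) below because the destination is not word aligned.
 */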
692
693 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
694 * mask will be zero and filtering is not needed
695 */
696 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
697 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
698 thresh, &hev, &mask);
699
700 /* if mask == 0, filtering is not needed */
701 if (mask) {
702 /* filtering */
703 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
704
705 /* unpack processed 4x4 neighborhood
706 * don't use transpose on output data
707 * because memory isn't aligned
708 */
709 __asm__ __volatile__(
710 "sb %[p4], 1(%[s4]) \n\t"
711 "sb %[p3], 0(%[s4]) \n\t"
712 "sb %[p2], -1(%[s4]) \n\t"
713 "sb %[p1], -2(%[s4]) \n\t"
714 :
715 : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
716 [p1] "r"(p1));
717
718 __asm__ __volatile__(
719 "srl %[p4], %[p4], 8 \n\t"
720 "srl %[p3], %[p3], 8 \n\t"
721 "srl %[p2], %[p2], 8 \n\t"
722 "srl %[p1], %[p1], 8 \n\t"
723 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
724 :);
725
726 __asm__ __volatile__(
727 "sb %[p4], 1(%[s3]) \n\t"
728 "sb %[p3], 0(%[s3]) \n\t"
729 "sb %[p2], -1(%[s3]) \n\t"
730 "sb %[p1], -2(%[s3]) \n\t"
731 : [p1] "+r"(p1)
732 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
733
734 __asm__ __volatile__(
735 "srl %[p4], %[p4], 8 \n\t"
736 "srl %[p3], %[p3], 8 \n\t"
737 "srl %[p2], %[p2], 8 \n\t"
738 "srl %[p1], %[p1], 8 \n\t"
739 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
740 :);
741
742 __asm__ __volatile__(
743 "sb %[p4], 1(%[s2]) \n\t"
744 "sb %[p3], 0(%[s2]) \n\t"
745 "sb %[p2], -1(%[s2]) \n\t"
746 "sb %[p1], -2(%[s2]) \n\t"
747 :
748 : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
749 [p1] "r"(p1));
750
751 __asm__ __volatile__(
752 "srl %[p4], %[p4], 8 \n\t"
753 "srl %[p3], %[p3], 8 \n\t"
754 "srl %[p2], %[p2], 8 \n\t"
755 "srl %[p1], %[p1], 8 \n\t"
756 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
757 :);
758
759 __asm__ __volatile__(
760 "sb %[p4], 1(%[s1]) \n\t"
761 "sb %[p3], 0(%[s1]) \n\t"
762 "sb %[p2], -1(%[s1]) \n\t"
763 "sb %[p1], -2(%[s1]) \n\t"
764 :
765 : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
766 [p1] "r"(p1));
767 }
768 }
769
770 s1 = s;
771 s2 = s + p;
772 s3 = s2 + p;
773 s4 = s3 + p;
774 s = s4 + p;
775
776 /* load quad-byte vectors
777 * memory is 4 byte aligned
778 */
779 p2 = *((uint32_t *)(s1 - 4));
780 p6 = *((uint32_t *)(s1));
781 p1 = *((uint32_t *)(s2 - 4));
782 p5 = *((uint32_t *)(s2));
783 p0 = *((uint32_t *)(s3 - 4));
784 p4 = *((uint32_t *)(s3));
785 pm1 = *((uint32_t *)(s4 - 4));
786 p3 = *((uint32_t *)(s4));
787
788 /* transpose pm1, p0, p1, p2 */
789 __asm__ __volatile__(
790 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
791 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
792 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
793 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
794
795 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
796 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
797 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
798 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
799
800 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
801 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
802 "append %[p1], %[sec3], 16 \n\t"
803 "append %[pm1], %[sec4], 16 \n\t"
804
805 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
806 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
807 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
808 :);
809
810 /* transpose p3, p4, p5, p6 */
811 __asm__ __volatile__(
812 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
813 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
814 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
815 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
816
817 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
818 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
819 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
820 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
821
822 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
823 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
824 "append %[p5], %[sec3], 16 \n\t"
825 "append %[p3], %[sec4], 16 \n\t"
826
827 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
828 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
829 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
830 :);
831
832 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
833 * mask will be zero and filtering is not needed
834 */
835 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
836 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
837 thresh, &hev, &mask);
838
839 /* if mask == 0, filtering is not needed */
840 if (mask) {
841 /* filtering */
842 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
843
844 /* unpack processed 4x4 neighborhood
845 * don't use transpose on output data
846 * because memory isn't aligned
847 */
848 __asm__ __volatile__(
849 "sb %[p4], 1(%[s4]) \n\t"
850 "sb %[p3], 0(%[s4]) \n\t"
851 "sb %[p2], -1(%[s4]) \n\t"
852 "sb %[p1], -2(%[s4]) \n\t"
853 :
854 : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
855 [p1] "r"(p1));
856
857 __asm__ __volatile__(
858 "srl %[p4], %[p4], 8 \n\t"
859 "srl %[p3], %[p3], 8 \n\t"
860 "srl %[p2], %[p2], 8 \n\t"
861 "srl %[p1], %[p1], 8 \n\t"
862 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
863 :);
864
865 __asm__ __volatile__(
866 "sb %[p4], 1(%[s3]) \n\t"
867 "sb %[p3], 0(%[s3]) \n\t"
868 "sb %[p2], -1(%[s3]) \n\t"
869 "sb %[p1], -2(%[s3]) \n\t"
870 : [p1] "+r"(p1)
871 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
872
873 __asm__ __volatile__(
874 "srl %[p4], %[p4], 8 \n\t"
875 "srl %[p3], %[p3], 8 \n\t"
876 "srl %[p2], %[p2], 8 \n\t"
877 "srl %[p1], %[p1], 8 \n\t"
878 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
879 :);
880
881 __asm__ __volatile__(
882 "sb %[p4], 1(%[s2]) \n\t"
883 "sb %[p3], 0(%[s2]) \n\t"
884 "sb %[p2], -1(%[s2]) \n\t"
885 "sb %[p1], -2(%[s2]) \n\t"
886 :
887 : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
888 [p1] "r"(p1));
889
890 __asm__ __volatile__(
891 "srl %[p4], %[p4], 8 \n\t"
892 "srl %[p3], %[p3], 8 \n\t"
893 "srl %[p2], %[p2], 8 \n\t"
894 "srl %[p1], %[p1], 8 \n\t"
895 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
896 :);
897
898 __asm__ __volatile__(
899 "sb %[p4], 1(%[s1]) \n\t"
900 "sb %[p3], 0(%[s1]) \n\t"
901 "sb %[p2], -1(%[s1]) \n\t"
902 "sb %[p1], -2(%[s1]) \n\t"
903 :
904 : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
905 [p1] "r"(p1));
906 }
907 }
908
909 i += 8;
910 }
911
912 while (i < count);
913 }
914
915 void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
916 unsigned int flimit,
917 unsigned int limit,
918 unsigned int thresh, int count) {
919 uint32_t mask, hev;
920 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
921 unsigned char *s1, *s2, *s3, *s4;
922 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
923 (void)count;
924
925 /* loop filter designed to work using chars so that we can make maximum use
926 * of 8 bit simd instructions.
927 */
928
929 /* apply filter on 4 pixels at the same time */
930
931 s1 = s;
932 s2 = s + p;
933 s3 = s2 + p;
934 s4 = s3 + p;
935
936 /* load quad-byte vectors
937 * memory is 4 byte aligned
938 */
939 p2 = *((uint32_t *)(s1 - 4));
940 p6 = *((uint32_t *)(s1));
941 p1 = *((uint32_t *)(s2 - 4));
942 p5 = *((uint32_t *)(s2));
943 p0 = *((uint32_t *)(s3 - 4));
944 p4 = *((uint32_t *)(s3));
945 pm1 = *((uint32_t *)(s4 - 4));
946 p3 = *((uint32_t *)(s4));
947
948 /* transpose pm1, p0, p1, p2 */
949 __asm__ __volatile__(
950 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
951 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
952 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
953 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
954
955 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
956 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
957 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
958 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
959
960 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
961 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
962 "append %[p1], %[sec3], 16 \n\t"
963 "append %[pm1], %[sec4], 16 \n\t"
964
965 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
966 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
967 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
968 :);
969
970 /* transpose p3, p4, p5, p6 */
971 __asm__ __volatile__(
972 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
973 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
974 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
975 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
976
977 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
978 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
979 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
980 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
981
982 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
983 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
984 "append %[p5], %[sec3], 16 \n\t"
985 "append %[p3], %[sec4], 16 \n\t"
986
987 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
988 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
989 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
990 :);
991
992 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
993 * mask will be zero and filtering is not needed
994 */
995 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
996 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
997 thresh, &hev, &mask);
998
999 /* if mask == 0, filtering is not needed */
1000 if (mask) {
1001 /* filtering */
1002 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1003
1004 /* unpack processed 4x4 neighborhood
1005 * don't use transpose on output data
1006 * because memory isn't aligned
1007 */
1008 __asm__ __volatile__(
1009 "sb %[p4], 1(%[s4]) \n\t"
1010 "sb %[p3], 0(%[s4]) \n\t"
1011 "sb %[p2], -1(%[s4]) \n\t"
1012 "sb %[p1], -2(%[s4]) \n\t"
1013 :
1014 :
1015 [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
1016
1017 __asm__ __volatile__(
1018 "srl %[p4], %[p4], 8 \n\t"
1019 "srl %[p3], %[p3], 8 \n\t"
1020 "srl %[p2], %[p2], 8 \n\t"
1021 "srl %[p1], %[p1], 8 \n\t"
1022 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1023 :);
1024
1025 __asm__ __volatile__(
1026 "sb %[p4], 1(%[s3]) \n\t"
1027 "sb %[p3], 0(%[s3]) \n\t"
1028 "sb %[p2], -1(%[s3]) \n\t"
1029 "sb %[p1], -2(%[s3]) \n\t"
1030 : [p1] "+r"(p1)
1031 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
1032
1033 __asm__ __volatile__(
1034 "srl %[p4], %[p4], 8 \n\t"
1035 "srl %[p3], %[p3], 8 \n\t"
1036 "srl %[p2], %[p2], 8 \n\t"
1037 "srl %[p1], %[p1], 8 \n\t"
1038 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1039 :);
1040
1041 __asm__ __volatile__(
1042 "sb %[p4], 1(%[s2]) \n\t"
1043 "sb %[p3], 0(%[s2]) \n\t"
1044 "sb %[p2], -1(%[s2]) \n\t"
1045 "sb %[p1], -2(%[s2]) \n\t"
1046 :
1047 :
1048 [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
1049
1050 __asm__ __volatile__(
1051 "srl %[p4], %[p4], 8 \n\t"
1052 "srl %[p3], %[p3], 8 \n\t"
1053 "srl %[p2], %[p2], 8 \n\t"
1054 "srl %[p1], %[p1], 8 \n\t"
1055 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1056 :);
1057
1058 __asm__ __volatile__(
1059 "sb %[p4], 1(%[s1]) \n\t"
1060 "sb %[p3], 0(%[s1]) \n\t"
1061 "sb %[p2], -1(%[s1]) \n\t"
1062 "sb %[p1], -2(%[s1]) \n\t"
1063 :
1064 :
1065 [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
1066 }
1067 }
1068
1069 s1 = s4 + p;
1070 s2 = s1 + p;
1071 s3 = s2 + p;
1072 s4 = s3 + p;
1073
1074 /* load quad-byte vectors
1075 * memory is 4 byte aligned
1076 */
1077 p2 = *((uint32_t *)(s1 - 4));
1078 p6 = *((uint32_t *)(s1));
1079 p1 = *((uint32_t *)(s2 - 4));
1080 p5 = *((uint32_t *)(s2));
1081 p0 = *((uint32_t *)(s3 - 4));
1082 p4 = *((uint32_t *)(s3));
1083 pm1 = *((uint32_t *)(s4 - 4));
1084 p3 = *((uint32_t *)(s4));
1085
1086 /* transpose pm1, p0, p1, p2 */
1087 __asm__ __volatile__(
1088 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
1089 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
1090 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
1091 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
1092
1093 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
1094 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
1095 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1096 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1097
1098 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
1099 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
1100 "append %[p1], %[sec3], 16 \n\t"
1101 "append %[pm1], %[sec4], 16 \n\t"
1102
1103 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1104 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
1105 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1106 :);
1107
1108 /* transpose p3, p4, p5, p6 */
1109 __asm__ __volatile__(
1110 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
1111 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
1112 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
1113 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
1114
1115 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
1116 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
1117 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1118 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1119
1120 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
1121 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
1122 "append %[p5], %[sec3], 16 \n\t"
1123 "append %[p3], %[sec4], 16 \n\t"
1124
1125 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1126 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
1127 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1128 :);
1129
1130 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1131 * mask will be zero and filtering is not needed
1132 */
1133 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1134 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1135 thresh, &hev, &mask);
1136
1137 /* if mask == 0, filtering is not needed */
1138 if (mask) {
1139 /* filtering */
1140 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
1141
1142 /* unpack processed 4x4 neighborhood
1143 * don't use transpose on output data
1144 * because memory isn't aligned
1145 */
1146 __asm__ __volatile__(
1147 "sb %[p4], 1(%[s4]) \n\t"
1148 "sb %[p3], 0(%[s4]) \n\t"
1149 "sb %[p2], -1(%[s4]) \n\t"
1150 "sb %[p1], -2(%[s4]) \n\t"
1151 :
1152 :
1153 [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
1154
1155 __asm__ __volatile__(
1156 "srl %[p4], %[p4], 8 \n\t"
1157 "srl %[p3], %[p3], 8 \n\t"
1158 "srl %[p2], %[p2], 8 \n\t"
1159 "srl %[p1], %[p1], 8 \n\t"
1160 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1161 :);
1162
1163 __asm__ __volatile__(
1164 "sb %[p4], 1(%[s3]) \n\t"
1165 "sb %[p3], 0(%[s3]) \n\t"
1166 "sb %[p2], -1(%[s3]) \n\t"
1167 "sb %[p1], -2(%[s3]) \n\t"
1168 : [p1] "+r"(p1)
1169 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
1170
1171 __asm__ __volatile__(
1172 "srl %[p4], %[p4], 8 \n\t"
1173 "srl %[p3], %[p3], 8 \n\t"
1174 "srl %[p2], %[p2], 8 \n\t"
1175 "srl %[p1], %[p1], 8 \n\t"
1176 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1177 :);
1178
1179 __asm__ __volatile__(
1180 "sb %[p4], 1(%[s2]) \n\t"
1181 "sb %[p3], 0(%[s2]) \n\t"
1182 "sb %[p2], -1(%[s2]) \n\t"
1183 "sb %[p1], -2(%[s2]) \n\t"
1184 :
1185 :
1186 [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
1187
1188 __asm__ __volatile__(
1189 "srl %[p4], %[p4], 8 \n\t"
1190 "srl %[p3], %[p3], 8 \n\t"
1191 "srl %[p2], %[p2], 8 \n\t"
1192 "srl %[p1], %[p1], 8 \n\t"
1193 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
1194 :);
1195
1196 __asm__ __volatile__(
1197 "sb %[p4], 1(%[s1]) \n\t"
1198 "sb %[p3], 0(%[s1]) \n\t"
1199 "sb %[p2], -1(%[s1]) \n\t"
1200 "sb %[p1], -2(%[s1]) \n\t"
1201 :
1202 :
1203 [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
1204 }
1205 }
1206 }
1207
1208 /* inputs & outputs are quad-byte vectors */
1209 static __inline void vp8_mbfilter_mips(uint32_t mask, uint32_t hev,
1210 uint32_t *ps2, uint32_t *ps1,
1211 uint32_t *ps0, uint32_t *qs0,
1212 uint32_t *qs1, uint32_t *qs2) {
1213 int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
1214 int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
1215 int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
1216 uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r,
1217 subr_r, subr_l;
1218 uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l,
1219 invhev_r;
1220 uint32_t N128, R63;
1221 uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
1222
1223 R63 = 0x003F003F;
1224 HWM = 0xFF00FF00;
1225 N128 = 0x80808080;
1226 t1 = 0x03000300;
1227 t2 = 0x04000400;
1228
1229 vps0 = (*ps0) ^ N128;
1230 vps1 = (*ps1) ^ N128;
1231 vps2 = (*ps2) ^ N128;
1232 vqs0 = (*qs0) ^ N128;
1233 vqs1 = (*qs1) ^ N128;
1234 vqs2 = (*qs2) ^ N128;
1235
1236 /* use halfword pairs instead of quad-bytes to preserve precision */
1237 vps0_l = vps0 & HWM;
1238 vps0_r = vps0 << 8;
1239 vps0_r = vps0_r & HWM;
1240
1241 vqs0_l = vqs0 & HWM;
1242 vqs0_r = vqs0 << 8;
1243 vqs0_r = vqs0_r & HWM;
1244
1245 vps1_l = vps1 & HWM;
1246 vps1_r = vps1 << 8;
1247 vps1_r = vps1_r & HWM;
1248
1249 vqs1_l = vqs1 & HWM;
1250 vqs1_r = vqs1 << 8;
1251 vqs1_r = vqs1_r & HWM;
1252
1253 vqs2_l = vqs2 & HWM;
1254 vqs2_r = vqs2 << 8;
1255 vqs2_r = vqs2_r & HWM;
1256
1257 __asm__ __volatile__(
1258 /* qs0 - ps0 */
1259 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
1260 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
1261
1262 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
1263 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
1264 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
1265
1266 : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=r"(vp8_filter_r),
1267 [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r)
1268 : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
1269 [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
1270 [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r));
1271
1272 vps2_l = vps2 & HWM;
1273 vps2_r = vps2 << 8;
1274 vps2_r = vps2_r & HWM;
1275
1276 /* add outer taps if we have high edge variance */
1277 __asm__ __volatile__(
1278 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
1279 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
1280 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
1281 "and %[mask_l], %[HWM], %[mask] \n\t"
1282 "sll %[mask_r], %[mask], 8 \n\t"
1283 "and %[mask_r], %[HWM], %[mask_r] \n\t"
1284 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
1285 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
1286 "and %[hev_l], %[HWM], %[hev] \n\t"
1287 "sll %[hev_r], %[hev], 8 \n\t"
1288 "and %[hev_r], %[HWM], %[hev_r] \n\t"
1289 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
1290 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
1291
1292 /* vp8_filter &= mask; */
1293 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
1294 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
1295
1296 /* Filter2 = vp8_filter & hev; */
1297 "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t"
1298 "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t"
1299
1300 : [vp8_filter_l] "+r"(vp8_filter_l), [vp8_filter_r] "+r"(vp8_filter_r),
1301 [hev_l] "=&r"(hev_l), [hev_r] "=&r"(hev_r), [mask_l] "=&r"(mask_l),
1302 [mask_r] "=&r"(mask_r), [Filter2_l] "=&r"(Filter2_l),
1303 [Filter2_r] "=&r"(Filter2_r)
1304 : [subr_l] "r"(subr_l), [subr_r] "r"(subr_r), [HWM] "r"(HWM),
1305 [hev] "r"(hev), [mask] "r"(mask));
1306
1307 /* save bottom 3 bits so that we round one side +4 and the other +3 */
1308 __asm__ __volatile__(
1309 /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
1310 "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t"
1311 "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
1312 "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t"
1313
1314 /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
1315 "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t"
1316 "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t"
1317
1318 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
1319 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
1320
1321 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
1322 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
1323 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
1324 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
1325 "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
1326
1327 /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
1328 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
1329 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
1330
1331 /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
1332 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
1333 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
1334
1335 : [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r),
1336 [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
1337 [Filter2_l] "+r"(Filter2_l), [Filter2_r] "+r"(Filter2_r),
1338 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
1339 [vqs0_r] "+r"(vqs0_r)
1340 : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), [hev_l] "r"(hev_l),
1341 [hev_r] "r"(hev_r));
1342
1343 /* only apply wider filter if not high edge variance */
1344 __asm__ __volatile__(
1345 /* vp8_filter &= ~hev; */
1346 "and %[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t"
1347 "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t"
1348
1349 "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t"
1350 "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t"
1351
1352 : [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r)
1353 : [vp8_filter_l] "r"(vp8_filter_l), [vp8_filter_r] "r"(vp8_filter_r),
1354 [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
1355
1356 /* roughly 3/7th difference across boundary */
1357 __asm__ __volatile__(
1358 "shll.ph %[u3_l], %[Filter2_l], 3 \n\t"
1359 "shll.ph %[u3_r], %[Filter2_r], 3 \n\t"
1360
1361 "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t"
1362 "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t"
1363
1364 "shll.ph %[u2_l], %[u3_l], 1 \n\t"
1365 "shll.ph %[u2_r], %[u3_r], 1 \n\t"
1366
1367 "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t"
1368 "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t"
1369
1370 "addq.ph %[u2_l], %[u2_l], %[R63] \n\t"
1371 "addq.ph %[u2_r], %[u2_r], %[R63] \n\t"
1372
1373 "addq.ph %[u3_l], %[u3_l], %[R63] \n\t"
1374 "addq.ph %[u3_r], %[u3_r], %[R63] \n\t"
1375
1376 /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
1377 * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
1378 */
1379 "addq.ph %[u1_l], %[u1_l], %[R63] \n\t"
1380 "addq.ph %[u1_r], %[u1_r], %[R63] \n\t"
1381 "shra.ph %[u1_l], %[u1_l], 7 \n\t"
1382 "shra.ph %[u1_r], %[u1_r], 7 \n\t"
1383 "shra.ph %[u2_l], %[u2_l], 7 \n\t"
1384 "shra.ph %[u2_r], %[u2_r], 7 \n\t"
1385 "shll.ph %[u1_l], %[u1_l], 8 \n\t"
1386 "shll.ph %[u1_r], %[u1_r], 8 \n\t"
1387 "shll.ph %[u2_l], %[u2_l], 8 \n\t"
1388 "shll.ph %[u2_r], %[u2_r], 8 \n\t"
1389
1390 /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
1391 "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t"
1392 "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t"
1393
1394 /* vps0 = vp8_signed_char_clamp(ps0 + u); */
1395 "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t"
1396 "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t"
1397
1398 : [u1_l] "=&r"(u1_l), [u1_r] "=&r"(u1_r), [u2_l] "=&r"(u2_l),
1399 [u2_r] "=&r"(u2_r), [u3_l] "=&r"(u3_l), [u3_r] "=&r"(u3_r),
1400 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
1401 [vqs0_r] "+r"(vqs0_r)
1402 : [R63] "r"(R63), [Filter2_l] "r"(Filter2_l), [Filter2_r] "r"(Filter2_r));
1403
1404 __asm__ __volatile__(
1405 /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
1406 "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t"
1407 "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t"
1408
1409 /* vps1 = vp8_signed_char_clamp(ps1 + u); */
1410 "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t"
1411 "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t"
1412
1413 : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
1414 [vqs1_r] "+r"(vqs1_r)
1415 : [u2_l] "r"(u2_l), [u2_r] "r"(u2_r));
1416
1417 /* roughly 1/7th difference across boundary */
1418 __asm__ __volatile__(
1419 /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
1420 "shra.ph %[u3_l], %[u3_l], 7 \n\t"
1421 "shra.ph %[u3_r], %[u3_r], 7 \n\t"
1422 "shll.ph %[u3_l], %[u3_l], 8 \n\t"
1423 "shll.ph %[u3_r], %[u3_r], 8 \n\t"
1424
1425 /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
1426 "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t"
1427 "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t"
1428
1429 /* vps2 = vp8_signed_char_clamp(ps2 + u); */
1430 "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t"
1431 "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t"
1432
1433 : [u3_l] "+r"(u3_l), [u3_r] "+r"(u3_r), [vps2_l] "+r"(vps2_l),
1434 [vps2_r] "+r"(vps2_r), [vqs2_l] "+r"(vqs2_l), [vqs2_r] "+r"(vqs2_r)
1435 :);
1436
1437 /* Create quad-bytes from halfword pairs */
1438 __asm__ __volatile__(
1439 "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t"
1440 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
1441
1442 "and %[vps0_l], %[vps0_l], %[HWM] \n\t"
1443 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
1444
1445 "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t"
1446 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
1447
1448 "and %[vps1_l], %[vps1_l], %[HWM] \n\t"
1449 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
1450
1451 "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t"
1452 "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t"
1453
1454 "and %[vps2_l], %[vps2_l], %[HWM] \n\t"
1455 "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t"
1456
1457 "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t"
1458 "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t"
1459 "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t"
1460 "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t"
1461 "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t"
1462 "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t"
1463
1464 : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
1465 [vqs1_r] "+r"(vqs1_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r),
1466 [vqs0_l] "+r"(vqs0_l), [vqs0_r] "+r"(vqs0_r), [vqs2_l] "+r"(vqs2_l),
1467 [vqs2_r] "+r"(vqs2_r), [vps2_r] "+r"(vps2_r), [vps2_l] "+r"(vps2_l)
1468 : [HWM] "r"(HWM));
1469
1470 *ps0 = vps0_r ^ N128;
1471 *ps1 = vps1_r ^ N128;
1472 *ps2 = vps2_r ^ N128;
1473 *qs0 = vqs0_r ^ N128;
1474 *qs1 = vqs1_r ^ N128;
1475 *qs2 = vqs2_r ^ N128;
1476 }
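/* Scalar outline of the macroblock filter above, reconstructed from the
 * comments in the asm blocks (an illustrative sketch for a single pixel;
 * clamp() is signed 8-bit saturation and the values were biased by ^ 0x80):
 *
 *   f  = clamp(ps1 - qs1);
 *   f  = clamp(f + 3 * (qs0 - ps0));
 *   f &= mask;
 *
 *   Filter2 = f & hev;                   (sharp part, strong edges only)
 *   Filter1 = clamp(Filter2 + 4) >> 3;   qs0 = clamp(qs0 - Filter1);
 *   Filter2 = clamp(Filter2 + 3) >> 3;   ps0 = clamp(ps0 + Filter2);
 *
 *   f &= ~hev;                           (wide part, flat edges only)
 *   u = clamp((63 + f * 27) >> 7);  qs0 = clamp(qs0 - u);  ps0 = clamp(ps0 + u);
 *   u = clamp((63 + f * 18) >> 7);  qs1 = clamp(qs1 - u);  ps1 = clamp(ps1 + u);
 *   u = clamp((63 + f *  9) >> 7);  qs2 = clamp(qs2 - u);  ps2 = clamp(ps2 + u);
 *
 * The 27/18/9 weights are built above without multiplies:
 * f*9 = (f << 3) + f, f*18 = (f*9) << 1, f*27 = f*9 + f*18, with R63
 * supplying the +63 rounding term before the arithmetic shift right by 7.
 */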
1477
1478 void vp8_mbloop_filter_horizontal_edge_mips(unsigned char *s, int p,
1479 unsigned int flimit,
1480 unsigned int limit,
1481 unsigned int thresh, int count) {
1482 int i;
1483 uint32_t mask, hev;
1484 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1485 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1486
1487 mask = 0;
1488 hev = 0;
1489 i = 0;
1490 p1 = 0;
1491 p2 = 0;
1492 p3 = 0;
1493 p4 = 0;
1494
1495 /* loop filter designed to work using chars so that we can make maximum use
1496 * of 8 bit simd instructions.
1497 */
1498
1499 sm1 = s - (p << 2);
1500 s0 = s - p - p - p;
1501 s1 = s - p - p;
1502 s2 = s - p;
1503 s3 = s;
1504 s4 = s + p;
1505 s5 = s + p + p;
1506 s6 = s + p + p + p;
1507
1508 /* prefetch data for load */
1509 prefetch_load_lf(s + p);
1510
1511 /* apply filter on 4 pixels at the same time */
1512 do {
1513 /* load quad-byte vectors
1514 * memory is 4 byte aligned
1515 */
1516 p1 = *((uint32_t *)(s1));
1517 p2 = *((uint32_t *)(s2));
1518 p3 = *((uint32_t *)(s3));
1519 p4 = *((uint32_t *)(s4));
1520
1521 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1522 * mask will be zero and filtering is not needed
1523 */
1524 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1525 pm1 = *((uint32_t *)(sm1));
1526 p0 = *((uint32_t *)(s0));
1527 p5 = *((uint32_t *)(s5));
1528 p6 = *((uint32_t *)(s6));
1529
1530 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1531 thresh, &hev, &mask);
1532
1533 /* if mask == 0, filtering is not needed */
1534 if (mask) {
1535 /* filtering */
1536 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1537
1538 /* unpack processed 4x4 neighborhood
1539 * memory is 4 byte aligned
1540 */
1541 *((uint32_t *)s0) = p0;
1542 *((uint32_t *)s1) = p1;
1543 *((uint32_t *)s2) = p2;
1544 *((uint32_t *)s3) = p3;
1545 *((uint32_t *)s4) = p4;
1546 *((uint32_t *)s5) = p5;
1547 }
1548 }
1549
1550 sm1 += 4;
1551 s0 += 4;
1552 s1 += 4;
1553 s2 += 4;
1554 s3 += 4;
1555 s4 += 4;
1556 s5 += 4;
1557 s6 += 4;
1558
1559 /* load quad-byte vectors
1560 * memory is 4 byte aligned
1561 */
1562 p1 = *((uint32_t *)(s1));
1563 p2 = *((uint32_t *)(s2));
1564 p3 = *((uint32_t *)(s3));
1565 p4 = *((uint32_t *)(s4));
1566
1567 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1568 * mask will be zero and filtering is not needed
1569 */
1570 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1571 pm1 = *((uint32_t *)(sm1));
1572 p0 = *((uint32_t *)(s0));
1573 p5 = *((uint32_t *)(s5));
1574 p6 = *((uint32_t *)(s6));
1575
1576 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1577 thresh, &hev, &mask);
1578
1579 /* if mask == 0, filtering is not needed */
1580 if (mask) {
1581 /* filtering */
1582 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1583
1584 /* unpack processed 4x4 neighborhood
1585 * memory is 4 byte aligned
1586 */
1587 *((uint32_t *)s0) = p0;
1588 *((uint32_t *)s1) = p1;
1589 *((uint32_t *)s2) = p2;
1590 *((uint32_t *)s3) = p3;
1591 *((uint32_t *)s4) = p4;
1592 *((uint32_t *)s5) = p5;
1593 }
1594 }
1595
1596 sm1 += 4;
1597 s0 += 4;
1598 s1 += 4;
1599 s2 += 4;
1600 s3 += 4;
1601 s4 += 4;
1602 s5 += 4;
1603 s6 += 4;
1604
1605 i += 8;
1606 }
1607
1608 while (i < count);
1609 }
1610
1611 void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
1612 unsigned int flimit,
1613 unsigned int limit,
1614 unsigned int thresh, int count) {
1615 uint32_t mask, hev;
1616 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1617 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
1618 (void)count;
1619
1620 mask = 0;
1621 hev = 0;
1622 p1 = 0;
1623 p2 = 0;
1624 p3 = 0;
1625 p4 = 0;
1626
1627 /* loop filter designed to work using chars so that we can make maximum use
1628 * of 8 bit simd instructions.
1629 */
1630
1631 sm1 = s - (p << 2);
1632 s0 = s - p - p - p;
1633 s1 = s - p - p;
1634 s2 = s - p;
1635 s3 = s;
1636 s4 = s + p;
1637 s5 = s + p + p;
1638 s6 = s + p + p + p;
1639
1640 /* load quad-byte vectors
1641 * memory is 4 byte aligned
1642 */
1643 p1 = *((uint32_t *)(s1));
1644 p2 = *((uint32_t *)(s2));
1645 p3 = *((uint32_t *)(s3));
1646 p4 = *((uint32_t *)(s4));
1647
1648 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1649 * mask will be zero and filtering is not needed
1650 */
1651 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1652 pm1 = *((uint32_t *)(sm1));
1653 p0 = *((uint32_t *)(s0));
1654 p5 = *((uint32_t *)(s5));
1655 p6 = *((uint32_t *)(s6));
1656
1657 /* if mask == 0, filtering is not needed */
1658 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1659 thresh, &hev, &mask);
1660
1661 if (mask) {
1662 /* filtering */
1663 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1664
1665 /* unpack processed 4x4 neighborhood
1666 * memory is 4 byte aligned
1667 */
1668 *((uint32_t *)s0) = p0;
1669 *((uint32_t *)s1) = p1;
1670 *((uint32_t *)s2) = p2;
1671 *((uint32_t *)s3) = p3;
1672 *((uint32_t *)s4) = p4;
1673 *((uint32_t *)s5) = p5;
1674 }
1675 }
1676
1677 sm1 += 4;
1678 s0 += 4;
1679 s1 += 4;
1680 s2 += 4;
1681 s3 += 4;
1682 s4 += 4;
1683 s5 += 4;
1684 s6 += 4;
1685
1686 /* load quad-byte vectors
1687 * memory is 4 byte aligned
1688 */
1689 p1 = *((uint32_t *)(s1));
1690 p2 = *((uint32_t *)(s2));
1691 p3 = *((uint32_t *)(s3));
1692 p4 = *((uint32_t *)(s4));
1693
1694 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1695 * mask will be zero and filtering is not needed
1696 */
1697 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1698 pm1 = *((uint32_t *)(sm1));
1699 p0 = *((uint32_t *)(s0));
1700 p5 = *((uint32_t *)(s5));
1701 p6 = *((uint32_t *)(s6));
1702
1703 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1704 thresh, &hev, &mask);
1705
1706 /* if mask == 0, filtering is not needed */
1707 if (mask) {
1708 /* filtering */
1709 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1710
1711 /* unpack processed 4x4 neighborhood
1712 * memory is 4 byte aligned
1713 */
1714 *((uint32_t *)s0) = p0;
1715 *((uint32_t *)s1) = p1;
1716 *((uint32_t *)s2) = p2;
1717 *((uint32_t *)s3) = p3;
1718 *((uint32_t *)s4) = p4;
1719 *((uint32_t *)s5) = p5;
1720 }
1721 }
1722 }
1723
1724 void vp8_mbloop_filter_vertical_edge_mips(unsigned char *s, int p,
1725 unsigned int flimit,
1726 unsigned int limit,
1727 unsigned int thresh, int count) {
1728 int i;
1729 uint32_t mask, hev;
1730 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1731 unsigned char *s1, *s2, *s3, *s4;
1732 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1733
1734 mask = 0;
1735 hev = 0;
1736 i = 0;
1737 pm1 = 0;
1738 p0 = 0;
1739 p1 = 0;
1740 p2 = 0;
1741 p3 = 0;
1742 p4 = 0;
1743 p5 = 0;
1744 p6 = 0;
1745
1746 /* loop filter designed to work using chars so that we can make maximum use
1747 * of 8 bit simd instructions.
1748 */
1749
1750 /* apply filter on 4 pixels at the same time */
1751 do {
1752 s1 = s;
1753 s2 = s + p;
1754 s3 = s2 + p;
1755 s4 = s3 + p;
1756 s = s4 + p;
1757
1758 /* load quad-byte vectors
1759 * memory is 4 byte aligned
1760 */
1761 p2 = *((uint32_t *)(s1 - 4));
1762 p6 = *((uint32_t *)(s1));
1763 p1 = *((uint32_t *)(s2 - 4));
1764 p5 = *((uint32_t *)(s2));
1765 p0 = *((uint32_t *)(s3 - 4));
1766 p4 = *((uint32_t *)(s3));
1767 pm1 = *((uint32_t *)(s4 - 4));
1768 p3 = *((uint32_t *)(s4));
1769
1770 /* transpose pm1, p0, p1, p2 */
1771 __asm__ __volatile__(
1772 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
1773 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
1774 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
1775 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
1776
1777 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
1778 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
1779 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1780 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1781
1782 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
1783 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
1784 "append %[p1], %[sec3], 16 \n\t"
1785 "append %[pm1], %[sec4], 16 \n\t"
1786
1787 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1788 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
1789 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1790 :);
1791
1792 /* transpose p3, p4, p5, p6 */
1793 __asm__ __volatile__(
1794 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
1795 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
1796 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
1797 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
1798
1799 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
1800 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
1801 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1802 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1803
1804 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
1805 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
1806 "append %[p5], %[sec3], 16 \n\t"
1807 "append %[p3], %[sec4], 16 \n\t"
1808
1809 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1810 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
1811 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1812 :);
1813
1814 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
1815 * mask will be zero and filtering is not needed
1816 */
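/* after the transpose, p2/p3 hold the columns immediately left/right of the
 * edge and p1/p4 the next ones out, so this skips the filter when the pixels
 * nearest the edge match across it for all four rows
 */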
1817 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
1818 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
1819 thresh, &hev, &mask);
1820
1821 /* if mask == 0, filtering is not needed */
1822 if (mask) {
1823 /* filtering */
1824 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
1825
1826 /* don't use transpose on output data
1827 * because memory isn't aligned
1828 */
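/* each register holds one filtered column with the bottom row (s4) in its
 * least-significant byte: store that byte for every column, then shift the
 * registers right by 8 to expose the next row up (s3, then s2, then s1)
 */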
1829 __asm__ __volatile__(
1830 "sb %[p5], 2(%[s4]) \n\t"
1831 "sb %[p4], 1(%[s4]) \n\t"
1832 "sb %[p3], 0(%[s4]) \n\t"
1833 "sb %[p2], -1(%[s4]) \n\t"
1834 "sb %[p1], -2(%[s4]) \n\t"
1835 "sb %[p0], -3(%[s4]) \n\t"
1836 :
1837 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
1838 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1839
1840 __asm__ __volatile__(
1841 "srl %[p5], %[p5], 8 \n\t"
1842 "srl %[p4], %[p4], 8 \n\t"
1843 "srl %[p3], %[p3], 8 \n\t"
1844 "srl %[p2], %[p2], 8 \n\t"
1845 "srl %[p1], %[p1], 8 \n\t"
1846 "srl %[p0], %[p0], 8 \n\t"
1847 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
1848 [p1] "+r"(p1), [p0] "+r"(p0)
1849 :);
1850
1851 __asm__ __volatile__(
1852 "sb %[p5], 2(%[s3]) \n\t"
1853 "sb %[p4], 1(%[s3]) \n\t"
1854 "sb %[p3], 0(%[s3]) \n\t"
1855 "sb %[p2], -1(%[s3]) \n\t"
1856 "sb %[p1], -2(%[s3]) \n\t"
1857 "sb %[p0], -3(%[s3]) \n\t"
1858 :
1859 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
1860 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1861
1862 __asm__ __volatile__(
1863 "srl %[p5], %[p5], 8 \n\t"
1864 "srl %[p4], %[p4], 8 \n\t"
1865 "srl %[p3], %[p3], 8 \n\t"
1866 "srl %[p2], %[p2], 8 \n\t"
1867 "srl %[p1], %[p1], 8 \n\t"
1868 "srl %[p0], %[p0], 8 \n\t"
1869 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
1870 [p1] "+r"(p1), [p0] "+r"(p0)
1871 :);
1872
1873 __asm__ __volatile__(
1874 "sb %[p5], 2(%[s2]) \n\t"
1875 "sb %[p4], 1(%[s2]) \n\t"
1876 "sb %[p3], 0(%[s2]) \n\t"
1877 "sb %[p2], -1(%[s2]) \n\t"
1878 "sb %[p1], -2(%[s2]) \n\t"
1879 "sb %[p0], -3(%[s2]) \n\t"
1880 :
1881 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
1882 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1883
1884 __asm__ __volatile__(
1885 "srl %[p5], %[p5], 8 \n\t"
1886 "srl %[p4], %[p4], 8 \n\t"
1887 "srl %[p3], %[p3], 8 \n\t"
1888 "srl %[p2], %[p2], 8 \n\t"
1889 "srl %[p1], %[p1], 8 \n\t"
1890 "srl %[p0], %[p0], 8 \n\t"
1891 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
1892 [p1] "+r"(p1), [p0] "+r"(p0)
1893 :);
1894
1895 __asm__ __volatile__(
1896 "sb %[p5], 2(%[s1]) \n\t"
1897 "sb %[p4], 1(%[s1]) \n\t"
1898 "sb %[p3], 0(%[s1]) \n\t"
1899 "sb %[p2], -1(%[s1]) \n\t"
1900 "sb %[p1], -2(%[s1]) \n\t"
1901 "sb %[p0], -3(%[s1]) \n\t"
1902 :
1903 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
1904 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
1905 }
1906 }
1907
1908 i += 4;
1909 } while (i < count);
1912 }
1913
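/* The byte stores and 8-bit shifts above (and in the chroma variant below)
 * write each filtered column back one row at a time.  Per column, the net
 * effect is the illustrative sketch below (the helper name is made up and
 * the block is excluded from the build); s points at the column position in
 * the topmost of the four rows.
 */
#if 0
static void store_column_sketch(unsigned char *s, int pitch, uint32_t col) {
  int r;

  /* the bottom row sits in the least-significant byte of the column word */
  for (r = 3; r >= 0; --r) {
    s[r * pitch] = (unsigned char)(col & 0xff);
    col >>= 8;
  }
}
#endif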
1914 void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
1915 unsigned int flimit,
1916 unsigned int limit,
1917 unsigned int thresh, int count) {
1918 uint32_t mask, hev;
1919 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
1920 unsigned char *s1, *s2, *s3, *s4;
1921 uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
1922 (void)count;
1923
1924 mask = 0;
1925 hev = 0;
1926 pm1 = 0;
1927 p0 = 0;
1928 p1 = 0;
1929 p2 = 0;
1930 p3 = 0;
1931 p4 = 0;
1932 p5 = 0;
1933 p6 = 0;
1934
1935 /* the loop filter is designed to work on unsigned chars so that we can
1936 * make maximum use of the 8-bit SIMD instructions.
1937 */
1938
1939 /* apply filter on 4 pixels at the same time */
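/* the chroma edge is only 8 pixels tall, so instead of looping, the two
 * 4-row groups are processed back to back and the count argument is unused
 */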
1940
1941 s1 = s;
1942 s2 = s + p;
1943 s3 = s2 + p;
1944 s4 = s3 + p;
1945
1946 /* prefetch data for load */
1947 prefetch_load_lf(s + 2 * p);
1948
1949 /* load quad-byte vectors
1950 * memory is 4 byte aligned
1951 */
1952 p2 = *((uint32_t *)(s1 - 4));
1953 p6 = *((uint32_t *)(s1));
1954 p1 = *((uint32_t *)(s2 - 4));
1955 p5 = *((uint32_t *)(s2));
1956 p0 = *((uint32_t *)(s3 - 4));
1957 p4 = *((uint32_t *)(s3));
1958 pm1 = *((uint32_t *)(s4 - 4));
1959 p3 = *((uint32_t *)(s4));
1960
1961 /* transpose pm1, p0, p1, p2 */
1962 __asm__ __volatile__(
1963 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
1964 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
1965 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
1966 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
1967
1968 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
1969 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
1970 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1971 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1972
1973 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
1974 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
1975 "append %[p1], %[sec3], 16 \n\t"
1976 "append %[pm1], %[sec4], 16 \n\t"
1977
1978 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
1979 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
1980 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
1981 :);
1982
1983 /* transpose p3, p4, p5, p6 */
1984 __asm__ __volatile__(
1985 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
1986 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
1987 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
1988 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
1989
1990 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
1991 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
1992 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
1993 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
1994
1995 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
1996 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
1997 "append %[p5], %[sec3], 16 \n\t"
1998 "append %[p3], %[sec4], 16 \n\t"
1999
2000 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
2001 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
2002 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
2003 :);
2004
2005 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2006 * mask will be zero and filtering is not needed
2007 */
2008 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
2009 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2010 thresh, &hev, &mask);
2011
2012 /* if mask == 0, filtering is not needed */
2013 if (mask) {
2014 /* filtering */
2015 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2016
2017 /* don't use transpose on output data
2018 * because memory isn't aligned
2019 */
2020 __asm__ __volatile__(
2021 "sb %[p5], 2(%[s4]) \n\t"
2022 "sb %[p4], 1(%[s4]) \n\t"
2023 "sb %[p3], 0(%[s4]) \n\t"
2024 "sb %[p2], -1(%[s4]) \n\t"
2025 "sb %[p1], -2(%[s4]) \n\t"
2026 "sb %[p0], -3(%[s4]) \n\t"
2027 :
2028 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
2029 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2030
2031 __asm__ __volatile__(
2032 "srl %[p5], %[p5], 8 \n\t"
2033 "srl %[p4], %[p4], 8 \n\t"
2034 "srl %[p3], %[p3], 8 \n\t"
2035 "srl %[p2], %[p2], 8 \n\t"
2036 "srl %[p1], %[p1], 8 \n\t"
2037 "srl %[p0], %[p0], 8 \n\t"
2038 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2039 [p1] "+r"(p1), [p0] "+r"(p0)
2040 :);
2041
2042 __asm__ __volatile__(
2043 "sb %[p5], 2(%[s3]) \n\t"
2044 "sb %[p4], 1(%[s3]) \n\t"
2045 "sb %[p3], 0(%[s3]) \n\t"
2046 "sb %[p2], -1(%[s3]) \n\t"
2047 "sb %[p1], -2(%[s3]) \n\t"
2048 "sb %[p0], -3(%[s3]) \n\t"
2049 :
2050 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
2051 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2052
2053 __asm__ __volatile__(
2054 "srl %[p5], %[p5], 8 \n\t"
2055 "srl %[p4], %[p4], 8 \n\t"
2056 "srl %[p3], %[p3], 8 \n\t"
2057 "srl %[p2], %[p2], 8 \n\t"
2058 "srl %[p1], %[p1], 8 \n\t"
2059 "srl %[p0], %[p0], 8 \n\t"
2060 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2061 [p1] "+r"(p1), [p0] "+r"(p0)
2062 :);
2063
2064 __asm__ __volatile__(
2065 "sb %[p5], 2(%[s2]) \n\t"
2066 "sb %[p4], 1(%[s2]) \n\t"
2067 "sb %[p3], 0(%[s2]) \n\t"
2068 "sb %[p2], -1(%[s2]) \n\t"
2069 "sb %[p1], -2(%[s2]) \n\t"
2070 "sb %[p0], -3(%[s2]) \n\t"
2071 :
2072 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
2073 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2074
2075 __asm__ __volatile__(
2076 "srl %[p5], %[p5], 8 \n\t"
2077 "srl %[p4], %[p4], 8 \n\t"
2078 "srl %[p3], %[p3], 8 \n\t"
2079 "srl %[p2], %[p2], 8 \n\t"
2080 "srl %[p1], %[p1], 8 \n\t"
2081 "srl %[p0], %[p0], 8 \n\t"
2082 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2083 [p1] "+r"(p1), [p0] "+r"(p0)
2084 :);
2085
2086 __asm__ __volatile__(
2087 "sb %[p5], 2(%[s1]) \n\t"
2088 "sb %[p4], 1(%[s1]) \n\t"
2089 "sb %[p3], 0(%[s1]) \n\t"
2090 "sb %[p2], -1(%[s1]) \n\t"
2091 "sb %[p1], -2(%[s1]) \n\t"
2092 "sb %[p0], -3(%[s1]) \n\t"
2093 :
2094 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
2095 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2096 }
2097 }
2098
2099 s1 = s4 + p;
2100 s2 = s1 + p;
2101 s3 = s2 + p;
2102 s4 = s3 + p;
2103
2104 /* load quad-byte vectors
2105 * memory is 4 byte aligned
2106 */
2107 p2 = *((uint32_t *)(s1 - 4));
2108 p6 = *((uint32_t *)(s1));
2109 p1 = *((uint32_t *)(s2 - 4));
2110 p5 = *((uint32_t *)(s2));
2111 p0 = *((uint32_t *)(s3 - 4));
2112 p4 = *((uint32_t *)(s3));
2113 pm1 = *((uint32_t *)(s4 - 4));
2114 p3 = *((uint32_t *)(s4));
2115
2116 /* transpose pm1, p0, p1, p2 */
2117 __asm__ __volatile__(
2118 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
2119 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
2120 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
2121 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
2122
2123 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
2124 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
2125 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
2126 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
2127
2128 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
2129 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
2130 "append %[p1], %[sec3], 16 \n\t"
2131 "append %[pm1], %[sec4], 16 \n\t"
2132
2133 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
2134 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
2135 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
2136 :);
2137
2138 /* transpose p3, p4, p5, p6 */
2139 __asm__ __volatile__(
2140 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
2141 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
2142 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
2143 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
2144
2145 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
2146 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
2147 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
2148 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
2149
2150 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
2151 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
2152 "append %[p5], %[sec3], 16 \n\t"
2153 "append %[p3], %[sec4], 16 \n\t"
2154
2155 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
2156 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
2157 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
2158 :);
2159
2160 /* if (p1 - p4 == 0) and (p2 - p3 == 0)
2161 * mask will be zero and filtering is not needed
2162 */
2163 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
2164 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
2165 thresh, &hev, &mask);
2166
2167 /* if mask == 0, filtering is not needed */
2168 if (mask) {
2169 /* filtering */
2170 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
2171
2172 /* don't use transpose on output data
2173 * because memory isn't aligned
2174 */
2175 __asm__ __volatile__(
2176 "sb %[p5], 2(%[s4]) \n\t"
2177 "sb %[p4], 1(%[s4]) \n\t"
2178 "sb %[p3], 0(%[s4]) \n\t"
2179 "sb %[p2], -1(%[s4]) \n\t"
2180 "sb %[p1], -2(%[s4]) \n\t"
2181 "sb %[p0], -3(%[s4]) \n\t"
2182 :
2183 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
2184 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2185
2186 __asm__ __volatile__(
2187 "srl %[p5], %[p5], 8 \n\t"
2188 "srl %[p4], %[p4], 8 \n\t"
2189 "srl %[p3], %[p3], 8 \n\t"
2190 "srl %[p2], %[p2], 8 \n\t"
2191 "srl %[p1], %[p1], 8 \n\t"
2192 "srl %[p0], %[p0], 8 \n\t"
2193 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2194 [p1] "+r"(p1), [p0] "+r"(p0)
2195 :);
2196
2197 __asm__ __volatile__(
2198 "sb %[p5], 2(%[s3]) \n\t"
2199 "sb %[p4], 1(%[s3]) \n\t"
2200 "sb %[p3], 0(%[s3]) \n\t"
2201 "sb %[p2], -1(%[s3]) \n\t"
2202 "sb %[p1], -2(%[s3]) \n\t"
2203 "sb %[p0], -3(%[s3]) \n\t"
2204 :
2205 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
2206 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2207
2208 __asm__ __volatile__(
2209 "srl %[p5], %[p5], 8 \n\t"
2210 "srl %[p4], %[p4], 8 \n\t"
2211 "srl %[p3], %[p3], 8 \n\t"
2212 "srl %[p2], %[p2], 8 \n\t"
2213 "srl %[p1], %[p1], 8 \n\t"
2214 "srl %[p0], %[p0], 8 \n\t"
2215 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2216 [p1] "+r"(p1), [p0] "+r"(p0)
2217 :);
2218
2219 __asm__ __volatile__(
2220 "sb %[p5], 2(%[s2]) \n\t"
2221 "sb %[p4], 1(%[s2]) \n\t"
2222 "sb %[p3], 0(%[s2]) \n\t"
2223 "sb %[p2], -1(%[s2]) \n\t"
2224 "sb %[p1], -2(%[s2]) \n\t"
2225 "sb %[p0], -3(%[s2]) \n\t"
2226 :
2227 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
2228 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2229
2230 __asm__ __volatile__(
2231 "srl %[p5], %[p5], 8 \n\t"
2232 "srl %[p4], %[p4], 8 \n\t"
2233 "srl %[p3], %[p3], 8 \n\t"
2234 "srl %[p2], %[p2], 8 \n\t"
2235 "srl %[p1], %[p1], 8 \n\t"
2236 "srl %[p0], %[p0], 8 \n\t"
2237 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
2238 [p1] "+r"(p1), [p0] "+r"(p0)
2239 :);
2240
2241 __asm__ __volatile__(
2242 "sb %[p5], 2(%[s1]) \n\t"
2243 "sb %[p4], 1(%[s1]) \n\t"
2244 "sb %[p3], 0(%[s1]) \n\t"
2245 "sb %[p2], -1(%[s1]) \n\t"
2246 "sb %[p1], -2(%[s1]) \n\t"
2247 "sb %[p0], -3(%[s1]) \n\t"
2248 :
2249 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
2250 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
2251 }
2252 }
2253 }
2254
2255 /* Horizontal MB filtering */
2256 void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2257 unsigned char *v_ptr, int y_stride,
2258 int uv_stride, loop_filter_info *lfi) {
2259 unsigned int thresh_vec, flimit_vec, limit_vec;
2260 unsigned char thresh, flimit, limit, flimit_temp;
2261
2262 /* use direct values instead of pointers */
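/* each lfi pointer is expected to address a vector of identical bytes (as
 * filled in by the loop-filter setup), so reading a single byte here is
 * enough before it is broadcast below
 */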
2263 limit = *(lfi->lim);
2264 flimit_temp = *(lfi->mblim);
2265 thresh = *(lfi->hev_thr);
2266 flimit = flimit_temp;
2267
2268 /* create quad-byte */
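/* replv.qb broadcasts the 8-bit value into all four byte lanes of a word
 * (the same result as multiplying it by 0x01010101), so the filters can
 * compare four pixels per instruction
 */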
2269 __asm__ __volatile__(
2270 "replv.qb %[thresh_vec], %[thresh] \n\t"
2271 "replv.qb %[flimit_vec], %[flimit] \n\t"
2272 "replv.qb %[limit_vec], %[limit] \n\t"
2273 : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2274 [limit_vec] "=r"(limit_vec)
2275 : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
2276
2277 vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
2278 thresh_vec, 16);
2279
2280 if (u_ptr) {
2281 vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec,
2282 limit_vec, thresh_vec, 0);
2283 }
2284
2285 if (v_ptr) {
2286 vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec,
2287 limit_vec, thresh_vec, 0);
2288 }
2289 }
2290
2291 /* Vertical MB Filtering */
2292 void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2293 unsigned char *v_ptr, int y_stride,
2294 int uv_stride, loop_filter_info *lfi) {
2295 unsigned int thresh_vec, flimit_vec, limit_vec;
2296 unsigned char thresh, flimit, limit, flimit_temp;
2297
2298 /* use direct values instead of pointers */
2299 limit = *(lfi->lim);
2300 flimit_temp = *(lfi->mblim);
2301 thresh = *(lfi->hev_thr);
2302 flimit = flimit_temp;
2303
2304 /* create quad-byte */
2305 __asm__ __volatile__(
2306 "replv.qb %[thresh_vec], %[thresh] \n\t"
2307 "replv.qb %[flimit_vec], %[flimit] \n\t"
2308 "replv.qb %[limit_vec], %[limit] \n\t"
2309 : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2310 [limit_vec] "=r"(limit_vec)
2311 : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
2312
2313 vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
2314 thresh_vec, 16);
2315
2316 if (u_ptr)
2317 vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec,
2318 limit_vec, thresh_vec, 0);
2319
2320 if (v_ptr)
2321 vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec,
2322 limit_vec, thresh_vec, 0);
2323 }
2324
2325 /* Horizontal B Filtering */
2326 void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2327 unsigned char *v_ptr, int y_stride, int uv_stride,
2328 loop_filter_info *lfi) {
2329 unsigned int thresh_vec, flimit_vec, limit_vec;
2330 unsigned char thresh, flimit, limit, flimit_temp;
2331
2332 /* use direct values instead of pointers */
2333 limit = *(lfi->lim);
2334 flimit_temp = *(lfi->blim);
2335 thresh = *(lfi->hev_thr);
2336 flimit = flimit_temp;
2337
2338 /* create quad-byte */
2339 __asm__ __volatile__(
2340 "replv.qb %[thresh_vec], %[thresh] \n\t"
2341 "replv.qb %[flimit_vec], %[flimit] \n\t"
2342 "replv.qb %[limit_vec], %[limit] \n\t"
2343 : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2344 [limit_vec] "=r"(limit_vec)
2345 : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
2346
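/* filter the three interior horizontal edges of the 16x16 luma block
 * (rows 4, 8 and 12) and the single interior edge of each 8x8 chroma block
 * (row 4)
 */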
2347 vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride,
2348 flimit_vec, limit_vec, thresh_vec, 16);
2349 vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride,
2350 flimit_vec, limit_vec, thresh_vec, 16);
2351 vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride,
2352 flimit_vec, limit_vec, thresh_vec, 16);
2353
2354 if (u_ptr)
2355 vp8_loop_filter_uvhorizontal_edge_mips(
2356 u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2357
2358 if (v_ptr)
2359 vp8_loop_filter_uvhorizontal_edge_mips(
2360 v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
2361 }
2362
2363 /* Vertical B Filtering */
2364 void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
2365 unsigned char *v_ptr, int y_stride, int uv_stride,
2366 loop_filter_info *lfi) {
2367 unsigned int thresh_vec, flimit_vec, limit_vec;
2368 unsigned char thresh, flimit, limit, flimit_temp;
2369
2370 /* use direct values instead of pointers */
2371 limit = *(lfi->lim);
2372 flimit_temp = *(lfi->blim);
2373 thresh = *(lfi->hev_thr);
2374 flimit = flimit_temp;
2375
2376 /* create quad-byte */
2377 __asm__ __volatile__(
2378 "replv.qb %[thresh_vec], %[thresh] \n\t"
2379 "replv.qb %[flimit_vec], %[flimit] \n\t"
2380 "replv.qb %[limit_vec], %[limit] \n\t"
2381 : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
2382 [limit_vec] "=r"(limit_vec)
2383 : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
2384
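/* filter the three interior vertical edges of the 16x16 luma block
 * (columns 4, 8 and 12) and the single interior edge of each 8x8 chroma
 * block (column 4)
 */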
2385 vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec,
2386 thresh_vec, 16);
2387 vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec,
2388 thresh_vec, 16);
2389 vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec,
2390 limit_vec, thresh_vec, 16);
2391
2392 if (u_ptr)
2393 vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec,
2394 limit_vec, thresh_vec, 0);
2395
2396 if (v_ptr)
2397 vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec,
2398 limit_vec, thresh_vec, 0);
2399 }
2400
2401 #endif
2402