/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
                                const uint8_t *thresh,
                                int count) {
  uint8_t i;
  uint32_t mask;
  uint32_t hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb    %[thresh_vec],  %[uthresh]  \n\t"
      "replv.qb    %[flimit_vec],  %[uflimit]  \n\t"
      "replv.qb    %[limit_vec],   %[ulimit]   \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  prefetch_store(s);

  /* loop filter designed to work using chars so that we can make maximum use
     of 8 bit simd instructions. */
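  /* two passes of four adjacent columns, packed into 32-bit words, cover the
     8-pixel-wide horizontal edge */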
  for (i = 0; i < 2; i++) {
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;

    __asm__ __volatile__ (
        "lw    %[p1],  (%[s1])  \n\t"
        "lw    %[p2],  (%[s2])  \n\t"
        "lw    %[p3],  (%[s3])  \n\t"
        "lw    %[p4],  (%[s4])  \n\t"

        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
       mask will be zero and filtering is not needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__ (
          "lw    %[pm1],  (%[sm1])  \n\t"
          "lw    %[p0],   (%[s0])   \n\t"
          "lw    %[p5],   (%[s5])   \n\t"
          "lw    %[p6],   (%[s6])   \n\t"

          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
            [p6] "=&r" (p6)
          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
      );

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
                            pm1, p0, p3, p4, p5, p6,
                            thresh_vec, &hev, &mask);
      /* if mask == 0 then filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__ (
            "sw    %[p1],  (%[s1])  \n\t"
            "sw    %[p2],  (%[s2])  \n\t"
            "sw    %[p3],  (%[s3])  \n\t"
            "sw    %[p4],  (%[s4])  \n\t"

            :
            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
        );
      }
    }

    s = s + 4;
  }
}

void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
                              const uint8_t *thresh,
                              int count) {
  uint8_t i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb    %[thresh_vec],  %[uthresh]  \n\t"
      "replv.qb    %[flimit_vec],  %[uflimit]  \n\t"
      "replv.qb    %[limit_vec],   %[ulimit]   \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  prefetch_store(s + pitch);

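  /* each pass handles four rows: load the eight bytes straddling the vertical
     edge in each row, transpose so every word packs one column, filter, then
     write the results back with individual byte stores */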
  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s  = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2  = *((uint32_t *)(s1 - 4));
    p6  = *((uint32_t *)(s1));
    p1  = *((uint32_t *)(s2 - 4));
    p5  = *((uint32_t *)(s2));
    p0  = *((uint32_t *)(s3 - 4));
    p4  = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3  = *((uint32_t *)(s4));

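    /* after the two transposes below, each 32-bit word holds the four pixels
       at one horizontal offset from the edge, one byte per row */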
    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                            p0, p3, p4, p5, p6, thresh_vec,
                            &hev, &mask);
      /* if mask == 0 then filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack processed 4x4 neighborhood
         * don't use transpose on output data
         * because memory isn't aligned
         */
        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s4] "r" (s4)
        );

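        /* shift each word right by 8 so the byte for the next row up moves
           into the low position before the following round of byte stores */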
        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8    \n\t"
            "srl    %[p3],  %[p3],  8    \n\t"
            "srl    %[p2],  %[p2],  8    \n\t"
            "srl    %[p1],  %[p1],  8    \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r" (p1)
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8    \n\t"
            "srl    %[p3],  %[p3],  8    \n\t"
            "srl    %[p2],  %[p2],  8    \n\t"
            "srl    %[p1],  %[p1],  8    \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s2] "r" (s2)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8    \n\t"
            "srl    %[p3],  %[p3],  8    \n\t"
            "srl    %[p2],  %[p2],  8    \n\t"
            "srl    %[p1],  %[p1],  8    \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s1] "r" (s1)
        );
      }
    }
  }
}

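/* the _dual variants filter two adjacent 8-pixel edges by applying the
   corresponding single-edge filter twice */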
void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2