/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask;
  uint32_t hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb   %[thresh_vec],  %[uthresh]  \n\t"
      "replv.qb   %[flimit_vec],  %[uflimit]  \n\t"
      "replv.qb   %[limit_vec],   %[ulimit]   \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
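  /* Note: replv.qb replicates the least-significant byte of the source
   * register into all four byte lanes of the destination, so each *_vec
   * above holds the same threshold in every lane -- roughly equivalent to
   * the C expression (uint32_t)value * 0x01010101u (illustrative sketch,
   * not part of the original code). */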

  /* prefetch data for store */
  prefetch_store(s);
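  /* prefetch_store() is the shared DSPr2 helper declared in common_dspr2.h;
   * it is assumed here to issue a MIPS "pref" store hint so the cache line
   * that will be written below is fetched ahead of the stores. */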

  /* loop filter designed to work using chars so that we can make maximum use
     of 8 bit simd instructions. */
  for (i = 0; i < 2; i++) {
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;
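    /* Row pointers around the horizontal edge at s: sm1..s2 are the rows
     * above the edge (p3..p0 in the usual loop-filter naming) and s3..s6 the
     * rows below it (q0..q3). Each pointer addresses 4 horizontally adjacent
     * pixels that are processed together as one 32-bit word. */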

    __asm__ __volatile__(
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
       mask will be zero and filtering is not needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__(
          "lw   %[pm1],  (%[sm1])   \n\t"
          "lw   %[p0],   (%[s0])    \n\t"
          "lw   %[p5],   (%[s5])    \n\t"
          "lw   %[p6],   (%[s6])    \n\t"

          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);
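      /* filter_hev_mask_dspr2() (loopfilter_masks_dspr2.h) computes the
       * filter mask and the high-edge-variance flag for the four pixels
       * packed in each word; both outputs are assumed to be per-byte
       * 0x00/0xff flags, which is why a single non-zero test on mask below
       * is enough to decide for all four columns at once. */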

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__(
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
      }
    }

    s = s + 4;
  }
}

void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb   %[thresh_vec],  %[uthresh]  \n\t"
      "replv.qb   %[flimit_vec],  %[uflimit]  \n\t"
      "replv.qb   %[limit_vec],   %[ulimit]   \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));
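    /* Each 32-bit load picks up 4 horizontally adjacent pixels from one row:
     * the loads at (sN - 4) cover the 4 pixels to the left of the vertical
     * edge and the loads at (sN) the 4 pixels to its right. The transposes
     * below rearrange these row words into column words so the same packed
     * filter code as the horizontal case can be reused. */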

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);
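    /* The precrq.qb.ph / precr.qb.ph pairs pick the high and low bytes of
     * each halfword of their operands, and the precrq.ph.w / append steps
     * recombine the halfwords; together they transpose this 4x4 block of
     * bytes so that pm1, p0, p1 and p2 now each hold one column of the
     * original rows. */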

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);
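    /* Same 4x4 byte transpose as above, applied to the block on the other
     * side of the vertical edge so p3..p6 also hold column vectors. */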

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack processed 4x4 neighborhood
         * don't use transpose on output data
         * because memory isn't aligned
         */
        __asm__ __volatile__(
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s4] "r"(s4));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r"(p1)
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s2] "r"(s2));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s1] "r"(s1));
      }
    }
  }
}

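/* The *_dual_ wrappers below filter a 16-pixel-wide edge as two adjacent
 * 8-pixel segments by calling the corresponding single-edge DSPr2 filter
 * twice, passing each segment its own threshold set where the prototype
 * provides one. */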
void vpx_lpf_horizontal_4_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_horizontal_8_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}
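/* For vertical edges the second segment starts 8 rows further down, so the
 * offset is 8 * p (eight rows of the pitch) rather than 8 pixels along the
 * row as in the horizontal case. */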

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2