/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
                                const uint8_t *thresh,
                                int count) {
  uint8_t   i;
  uint32_t  mask;
  uint32_t  hev;
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t   *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  uint32_t  uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
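
  /* replv.qb broadcasts the low byte of its source register into all four
     byte lanes, turning each scalar threshold into a quad-byte vector so
     four pixels can be checked at once. */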

  /* prefetch data for store */
  prefetch_store(s);

  /* loop filter designed to work using chars so that we can make maximum use
     of 8 bit simd instructions. */
  for (i = 0; i < 2; i++) {
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;

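    /* sm1 .. s6 now cover the eight rows straddling the horizontal edge:
       sm1..s2 are the four rows above it, s3..s6 the four rows below
       (s itself is the first row below the edge). */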
    __asm__ __volatile__ (
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero
       and no filtering is needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__ (
          "lw       %[pm1], (%[sm1])   \n\t"
          "lw       %[p0],  (%[s0])    \n\t"
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
            [p6] "=&r" (p6)
          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
      );

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
                            pm1, p0, p3, p4, p5, p6,
                            thresh_vec, &hev, &mask);

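      /* hev and mask appear to come back from filter_hev_mask_dspr2 as packed
         per-pixel byte flags, one byte per pixel of the quad (an assumption
         about the helper, not stated here). */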
      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__ (
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
        );
      }
    }

    s = s + 4;
  }
}

void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
                              const uint8_t *thresh,
                              int count) {
  uint8_t   i;
  uint32_t  mask, hev;
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t   *s1, *s2, *s3, *s4;
  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  uint32_t  uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s  = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2  = *((uint32_t *)(s1 - 4));
    p6  = *((uint32_t *)(s1));
    p1  = *((uint32_t *)(s2 - 4));
    p5  = *((uint32_t *)(s2));
    p0  = *((uint32_t *)(s3 - 4));
    p4  = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3  = *((uint32_t *)(s4));

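    /* Here p2, p1, p0, pm1 hold the four bytes just left of the vertical edge
       for rows s1..s4, and p6, p5, p4, p3 the four bytes at and right of it.
       The two transposes below rearrange them so each register holds one
       column of the block and the same filter_hev_mask_dspr2 / filter_dspr2
       helpers can be applied as in the horizontal case. */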
    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
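    /* Roughly: the precrq/precr qb.ph pairs interleave the high and low bytes
       of each halfword, and the precrq.ph.w / append steps stitch the halves
       back together, which amounts to a 4x4 byte transpose of each half. */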

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero
     * and no filtering is needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                            p0, p3, p4, p5, p6, thresh_vec,
                            &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack the processed 4x4 neighborhood byte by byte;
         * the output is not transposed because the destination
         * memory isn't 4-byte aligned
         */
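        /* Each of p1..p4 now holds one filtered column, one byte per row:
           the low byte is written to row s4, then an srl by 8 exposes the
           byte for s3, then s2, then s1. */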
        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s4] "r" (s4)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r" (p1)
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s2] "r" (s2)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s1] "r" (s1)
        );
      }
    }
  }
}

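/* The dual variants below filter two adjacent 8-pixel edges by calling the
   single-edge kernels twice, offset by 8 pixels horizontally or by 8 rows
   (8 * p) vertically. */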
void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2