/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/looprestoration.h"

#if BITDEPTH == 8

#define REST_UNIT_STRIDE (400)
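// Assumption (not stated in the original source): 400 is presumably chosen so
// that the widest padded stripe (a 384-pixel restoration unit plus 3 columns
// of padding on each side, i.e. 390 pixels) fits in one row of the temporary
// buffers with some slack left for alignment.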

void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
                                    uint8_t *tmp_ptr,
                                    const int16_t filterh[8],
                                    const int w, const int h);

void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
                                    const ptrdiff_t p_stride,
                                    const int32_t *hor,
                                    const int16_t filterv[8],
                                    const int w, const int h);
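
// How the two assembly passes above are used by dav1d_wiener_filter_lsx below:
// the horizontal pass reads the padded 8-bit rows from tmp_ptr, applies
// filterh and stores 32-bit intermediates into hor_ptr; the vertical pass then
// applies filterv to those intermediates and writes the filtered 8-bit pixels
// back to p.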

// This function is based on the equivalent function in ppc/looprestoration_init_tmpl.c.
static inline void padding(uint8_t *dst, const uint8_t *p,
                           const ptrdiff_t stride, const uint8_t (*left)[4],
                           const uint8_t *lpf, int unit_w, const int stripe_h,
                           const enum LrEdgeFlags edges)
{
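    // Layout of the dst working buffer (each row is REST_UNIT_STRIDE bytes):
    // 3 rows of padding above, stripe_h image rows, 3 rows of padding below,
    // and 3 columns of padding on the left and right of every row. Rows or
    // columns that are unavailable at the frame edges (as indicated by edges)
    // are filled by replicating the nearest valid pixels.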
    const int have_left = !!(edges & LR_HAVE_LEFT);
    const int have_right = !!(edges & LR_HAVE_RIGHT);

    // Copy more pixels if we don't have to pad them
    unit_w += 3 * have_left + 3 * have_right;
    uint8_t *dst_l = dst + 3 * !have_left;
    p -= 3 * have_left;
    lpf -= 3 * have_left;

    if (edges & LR_HAVE_TOP) {
        // Copy previous loop filtered rows
        const uint8_t *const above_1 = lpf;
        const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
        pixel_copy(dst_l, above_1, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
    } else {
        // Pad with first row
        pixel_copy(dst_l, p, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
        if (have_left) {
            pixel_copy(dst_l, &left[0][1], 3);
            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
        }
    }

    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
    if (edges & LR_HAVE_BOTTOM) {
        // Copy next loop filtered rows
        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
        const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
    } else {
        // Pad with last row
        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
        if (have_left) {
            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
        }
    }

    // Inner UNIT_WxSTRIPE_H
    for (int j = 0; j < stripe_h; j++) {
        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
        dst_tl += REST_UNIT_STRIDE;
        p += PXSTRIDE(stride);
    }

    if (!have_right) {
        uint8_t *pad = dst_l + unit_w;
        uint8_t *row_last = &dst_l[unit_w - 1];
        // Pad 3x(STRIPE_H+6) with last column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(pad, *row_last, 3);
            pad += REST_UNIT_STRIDE;
            row_last += REST_UNIT_STRIDE;
        }
    }

    if (!have_left) {
        // Pad 3x(STRIPE_H+6) with first column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(dst, *dst_l, 3);
            dst += REST_UNIT_STRIDE;
            dst_l += REST_UNIT_STRIDE;
        }
    } else {
        dst += 3 * REST_UNIT_STRIDE;
        for (int j = 0; j < stripe_h; j++) {
            pixel_copy(dst, &left[j][1], 3);
            dst += REST_UNIT_STRIDE;
        }
    }
}

// This function is based on the equivalent function in ppc/looprestoration_init_tmpl.c.

// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)
void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
                             const uint8_t (*const left)[4],
                             const uint8_t *lpf,
                             const int w, const int h,
                             const LooprestorationParams *const params,
                             const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    const int16_t (*const filter)[8] = params->filter;

    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
    // of padding above and below
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);

    BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
    BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
}
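
// Illustrative only (this hookup is not part of this file): in dav1d the
// functions defined here are typically registered in the LoongArch
// loop-restoration DSP init, roughly along these lines, assuming the generic
// Dav1dLoopRestorationDSPContext layout:
//
//     c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx;
//     c->sgr[0] = dav1d_sgr_filter_5x5_lsx;
//     c->sgr[1] = dav1d_sgr_filter_3x3_lsx;
//     c->sgr[2] = dav1d_sgr_filter_mix_lsx;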

void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
                              const int w, const int h);
void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);
void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
                                   int16_t *dst, int w1,
                                   const int w, const int h);
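
// boxsum3_lsx computes box sums over the entire padded stripe produced by
// padding(), hence the w + 6 / h + 6 dimensions: 3 pixels of padding are added
// on every side of the w x h restoration unit.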
static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}
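
// Self-guided restoration with a single 3x3 (radius 1) pass: box sums are
// computed over the padded stripe, dav1d_boxsum3_sgf_h applies the SGR
// strength s1, dav1d_boxsum3_sgf_v produces the filtered plane in dst, and
// dav1d_sgr_3x3_finish blends it into p with weight w1. (Summary of the call
// sequence below; the exact split of work between the sgf_h and sgf_v passes
// is an assumption based on their names.)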
void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum3_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}

void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
                              const uint8_t *const src,
                              const int w, const int h);

void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h,
                                  const unsigned s);

void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);

void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
                                   const int16_t *dst0, const int16_t *dst1,
                                   const int w0, const int w1,
                                   const int w, const int h);

static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
}
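
// Self-guided restoration with a single 5x5 (radius 2) pass, using strength s0
// and weight w0. Note that the final blend reuses BF(dav1d_sgr_3x3_finish, lsx),
// presumably because the finishing step is identical regardless of box size.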
void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
}
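
// The "mix" variant runs both the 5x5 pass (into dst0, with s0) and the 3x3
// pass (into dst1, with s1), reusing the same sumsq0/sum0 scratch buffers for
// both, and then blends the two filtered planes into p with weights w0 and w1
// via dav1d_sgr_mix_finish.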
void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);

    boxsum3_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);

    BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
                                  params->sgr.w1, w, h);
}
#endif