• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MSA version of rescaling functions
11 //
12 // Author: Prashant Patil (prashant.patil@imgtec.com)
13 
14 #include "src/dsp/dsp.h"
15 
16 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
17 
18 #include <assert.h>
19 
20 #include "src/utils/rescaler_utils.h"
21 #include "src/dsp/msa_macro.h"
22 
// Rounding bias: half of WEBP_RESCALER_ONE. It is added before the right
// shift by WEBP_RESCALER_RFIX so that MULT_FIX rounds instead of truncating
// (presumably WEBP_RESCALER_ONE == 1 << WEBP_RESCALER_RFIX -- confirm in
// rescaler_utils.h).
#define ROUNDER (WEBP_RESCALER_ONE >> 1)

// Scalar fixed-point multiply: widens 'x' to 64 bits, multiplies by 'y', adds
// ROUNDER and shifts the product right by WEBP_RESCALER_RFIX.
#define MULT_FIX(x, y) \
  (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
25 
// Vector counterpart of MULT_FIX for sixteen values at once.
//   in0..in3 : four vectors of 32-bit values (callers load them as v4u32).
//   scale    : multiplier replicated in every 32-bit lane.
//   shift    : shift amount replicated in every lane.
//   dst      : receives the sixteen results packed as a v16u8.
// Each input is interleaved with a vector named 'zero', which must be visible
// at the expansion site, then dot-multiplied by 'scale' into 64-bit lanes and
// shifted by 'shift' with SRAR_* (expected to be a rounding right shift, which
// would mirror the ROUNDER term of MULT_FIX -- see msa_macro.h). Three rounds
// of even-byte packing then narrow the 64-bit lanes to bytes.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
  v2u64 acc0, acc1, acc2, acc3;                                       \
  v4u32 ilv0, ilv1, ilv2, ilv3;                                       \
  v16u8 p0, p1, p2, p3, p4, p5;                                       \
  ILVRL_W2_UW(zero, in0, ilv0, ilv1);                                 \
  ILVRL_W2_UW(zero, in1, ilv2, ilv3);                                 \
  DOTP_UW2_UD(ilv0, ilv1, scale, scale, acc0, acc1);                  \
  DOTP_UW2_UD(ilv2, ilv3, scale, scale, acc2, acc3);                  \
  SRAR_D4_UD(acc0, acc1, acc2, acc3, shift);                          \
  PCKEV_B2_UB(acc1, acc0, acc3, acc2, p0, p1);                        \
  ILVRL_W2_UW(zero, in2, ilv0, ilv1);                                 \
  ILVRL_W2_UW(zero, in3, ilv2, ilv3);                                 \
  DOTP_UW2_UD(ilv0, ilv1, scale, scale, acc0, acc1);                  \
  DOTP_UW2_UD(ilv2, ilv3, scale, scale, acc2, acc3);                  \
  SRAR_D4_UD(acc0, acc1, acc2, acc3, shift);                          \
  PCKEV_B2_UB(acc1, acc0, acc3, acc2, p2, p3);                        \
  PCKEV_B2_UB(p1, p0, p3, p2, p4, p5);                                \
  dst = (v16u8)__msa_pckev_b((v16i8)p5, (v16i8)p4);                   \
} while (0)
45 
// Four-value variant of CALC_MULT_FIX_16.
//   in0   : one vector of four 32-bit values.
//   scale : multiplier replicated in every 32-bit lane.
//   shift : shift amount replicated in every lane.
//   dst   : scalar that receives 32-bit element 0 of the fully packed vector,
//           i.e. the four result bytes as one word.
// Requires a vector named 'zero' to be visible at the expansion site.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {  \
  v2u64 acc0, acc1;                                   \
  v4u32 ilv0, ilv1;                                   \
  v16i8 p0, p1;                                       \
  ILVRL_W2_UW(zero, in0, ilv0, ilv1);                 \
  DOTP_UW2_UD(ilv0, ilv1, scale, scale, acc0, acc1);  \
  SRAR_D2_UD(acc0, acc1, shift);                      \
  p0 = __msa_pckev_b((v16i8)acc1, (v16i8)acc0);       \
  p1 = __msa_pckev_b(p0, p0);                         \
  p0 = __msa_pckev_b(p1, p1);                         \
  dst = __msa_copy_s_w((v4i32)p0, 0);                 \
} while (0)
58 
// Like CALC_MULT_FIX_16, but the sixteen products are kept as 32-bit values:
// the shifted 64-bit lanes are narrowed with one even-word pack instead of
// being reduced to bytes.
//   in0..in3   : four vectors of 32-bit inputs.
//   fyscale    : multiplier replicated in every 32-bit lane.
//   shift      : shift amount replicated in every lane.
//   dst0..dst3 : receive four 32-bit results each.
// Requires a vector named 'zero' to be visible at the expansion site.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                          dst0, dst1, dst2, dst3) do {         \
  v2u64 acc0, acc1, acc2, acc3;                                \
  v4u32 ilv0, ilv1, ilv2, ilv3;                                \
  ILVRL_W2_UW(zero, in0, ilv0, ilv1);                          \
  ILVRL_W2_UW(zero, in1, ilv2, ilv3);                          \
  DOTP_UW2_UD(ilv0, ilv1, fyscale, fyscale, acc0, acc1);       \
  DOTP_UW2_UD(ilv2, ilv3, fyscale, fyscale, acc2, acc3);       \
  SRAR_D4_UD(acc0, acc1, acc2, acc3, shift);                   \
  PCKEV_W2_UW(acc1, acc0, acc3, acc2, dst0, dst1);             \
  ILVRL_W2_UW(zero, in2, ilv0, ilv1);                          \
  ILVRL_W2_UW(zero, in3, ilv2, ilv3);                          \
  DOTP_UW2_UD(ilv0, ilv1, fyscale, fyscale, acc0, acc1);       \
  DOTP_UW2_UD(ilv2, ilv3, fyscale, fyscale, acc2, acc3);       \
  SRAR_D4_UD(acc0, acc1, acc2, acc3, shift);                   \
  PCKEV_W2_UW(acc1, acc0, acc3, acc2, dst2, dst3);             \
} while (0)
76 
// Four-value variant of CALC_MULT_FIX1_16: multiplies the four 32-bit values
// of 'in0' by 'scale', shifts by 'shift' and stores the four 32-bit results
// in 'dst' (v4u32).
// Requires a vector named 'zero' to be visible at the expansion site.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {    \
  v2u64 acc0, acc1;                                      \
  v4u32 ilv0, ilv1;                                      \
  ILVRL_W2_UW(zero, in0, ilv0, ilv1);                    \
  DOTP_UW2_UD(ilv0, ilv1, scale, scale, acc0, acc1);     \
  SRAR_D2_UD(acc0, acc1, shift);                         \
  dst = (v4u32)__msa_pckev_w((v4i32)acc1, (v4i32)acc0);  \
} while (0)
85 
// Two-stage multiply used for blending two rows, eight values per call.
//   in0, in1   : two vectors from the first row.
//   in2, in3   : the matching two vectors from the second row.
//   mult       : vector of interleaved weight pairs, one weight per row.
//   scale      : multiplier replicated in every 32-bit lane.
//   shift      : shift amount replicated in every lane.
//   dst0, dst1 : receive the partially packed results (one even-byte pack has
//                been applied; callers finish the narrowing).
// Stage 1 interleaves in0 with in2 (and in1 with in3), dot-multiplies the
// pairs by 'mult' and shifts. Stage 2 dot-multiplies the stage-1 result by
// 'scale' and shifts again. Unlike CALC_MULT_FIX_16, no 'zero' is needed.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                          dst0, dst1) do {                         \
  v2u64 acc0, acc1, acc2, acc3;                                    \
  v4u32 ilv0, ilv1, ilv2, ilv3;                                    \
  ILVRL_W2_UW(in0, in2, ilv0, ilv1);                               \
  ILVRL_W2_UW(in1, in3, ilv2, ilv3);                               \
  DOTP_UW2_UD(ilv0, ilv1, mult, mult, acc0, acc1);                 \
  DOTP_UW2_UD(ilv2, ilv3, mult, mult, acc2, acc3);                 \
  SRAR_D4_UD(acc0, acc1, acc2, acc3, shift);                       \
  DOTP_UW2_UD(acc0, acc1, scale, scale, acc0, acc1);               \
  DOTP_UW2_UD(acc2, acc3, scale, scale, acc2, acc3);               \
  SRAR_D4_UD(acc0, acc1, acc2, acc3, shift);                       \
  PCKEV_B2_UB(acc1, acc0, acc3, acc2, dst0, dst1);                 \
} while (0)
100 
// Four-value variant of CALC_MULT_FIX2_16.
//   in0, in1 : one vector from each of the two rows being blended.
//   mult     : vector of interleaved weight pairs, one weight per row.
//   scale    : multiplier replicated in every 32-bit lane.
//   shift    : shift amount replicated in every lane.
//   dst      : scalar that receives 32-bit element 0 of the fully packed
//              vector, i.e. the four result bytes as one word.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
  v2u64 acc0, acc1;                                               \
  v4u32 ilv0, ilv1;                                               \
  v16i8 p0, p1;                                                   \
  ILVRL_W2_UW(in0, in1, ilv0, ilv1);                              \
  DOTP_UW2_UD(ilv0, ilv1, mult, mult, acc0, acc1);                \
  SRAR_D2_UD(acc0, acc1, shift);                                  \
  DOTP_UW2_UD(acc0, acc1, scale, scale, acc0, acc1);              \
  SRAR_D2_UD(acc0, acc1, shift);                                  \
  p0 = __msa_pckev_b((v16i8)acc1, (v16i8)acc0);                   \
  p1 = __msa_pckev_b(p0, p0);                                     \
  p0 = __msa_pckev_b(p1, p1);                                     \
  dst = __msa_copy_s_w((v4i32)p0, 0);                             \
} while (0)
115 
ExportRowExpand_0(const uint32_t * frow,uint8_t * dst,int length,WebPRescaler * const wrk)116 static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
117                                           int length,
118                                           WebPRescaler* const wrk) {
119   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
120   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
121   const v4i32 zero = { 0 };
122 
123   while (length >= 16) {
124     v4u32 src0, src1, src2, src3;
125     v16u8 out;
126     LD_UW4(frow, 4, src0, src1, src2, src3);
127     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
128     ST_UB(out, dst);
129     length -= 16;
130     frow   += 16;
131     dst    += 16;
132   }
133   if (length > 0) {
134     int x_out;
135     if (length >= 12) {
136       uint32_t val0_m, val1_m, val2_m;
137       v4u32 src0, src1, src2;
138       LD_UW3(frow, 4, src0, src1, src2);
139       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
140       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
141       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
142       SW3(val0_m, val1_m, val2_m, dst, 4);
143       length -= 12;
144       frow   += 12;
145       dst    += 12;
146     } else if (length >= 8) {
147       uint32_t val0_m, val1_m;
148       v4u32 src0, src1;
149       LD_UW2(frow, 4, src0, src1);
150       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
151       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
152       SW2(val0_m, val1_m, dst, 4);
153       length -= 8;
154       frow   += 8;
155       dst    += 8;
156     } else if (length >= 4) {
157       uint32_t val0_m;
158       const v4u32 src0 = LD_UW(frow);
159       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
160       SW(val0_m, dst);
161       length -= 4;
162       frow   += 4;
163       dst    += 4;
164     }
165     for (x_out = 0; x_out < length; ++x_out) {
166       const uint32_t J = frow[x_out];
167       const int v = (int)MULT_FIX(J, wrk->fy_scale);
168       assert(v >= 0 && v <= 255);
169       dst[x_out] = v;
170     }
171   }
172 }
173 
ExportRowExpand_1(const uint32_t * frow,uint32_t * irow,uint8_t * dst,int length,WebPRescaler * const wrk)174 static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
175                                           uint8_t* dst, int length,
176                                           WebPRescaler* const wrk) {
177   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
178   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
179   const v4i32 B1 = __msa_fill_w(B);
180   const v4i32 A1 = __msa_fill_w(A);
181   const v4i32 AB = __msa_ilvr_w(A1, B1);
182   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
183   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
184 
185   while (length >= 16) {
186     v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
187     v16u8 t0, t1, t2, t3, t4, t5;
188     LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
189     LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
190     CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
191     CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
192     PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
193     t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
194     ST_UB(t0, dst);
195     frow   += 16;
196     irow   += 16;
197     dst    += 16;
198     length -= 16;
199   }
200   if (length > 0) {
201     int x_out;
202     if (length >= 12) {
203       uint32_t val0_m, val1_m, val2_m;
204       v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
205       LD_UW3(frow, 4, frow0, frow1, frow2);
206       LD_UW3(irow, 4, irow0, irow1, irow2);
207       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
208       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
209       CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
210       SW3(val0_m, val1_m, val2_m, dst, 4);
211       frow   += 12;
212       irow   += 12;
213       dst    += 12;
214       length -= 12;
215     } else if (length >= 8) {
216       uint32_t val0_m, val1_m;
217       v4u32 frow0, frow1, irow0, irow1;
218       LD_UW2(frow, 4, frow0, frow1);
219       LD_UW2(irow, 4, irow0, irow1);
220       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
221       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
222       SW2(val0_m, val1_m, dst, 4);
223       frow   += 4;
224       irow   += 4;
225       dst    += 4;
226       length -= 4;
227     } else if (length >= 4) {
228       uint32_t val0_m;
229       const v4u32 frow0 = LD_UW(frow + 0);
230       const v4u32 irow0 = LD_UW(irow + 0);
231       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
232       SW(val0_m, dst);
233       frow   += 4;
234       irow   += 4;
235       dst    += 4;
236       length -= 4;
237     }
238     for (x_out = 0; x_out < length; ++x_out) {
239       const uint64_t I = (uint64_t)A * frow[x_out]
240                        + (uint64_t)B * irow[x_out];
241       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
242       const int v = (int)MULT_FIX(J, wrk->fy_scale);
243       assert(v >= 0 && v <= 255);
244       dst[x_out] = v;
245     }
246   }
247 }
248 
// Exports one output row in the vertical-expand case. When wrk->y_accum is
// zero the row comes from wrk->frow alone; otherwise wrk->frow and wrk->irow
// are blended. dst_width * num_channels values are written to wrk->dst.
// NOTE(review): despite the "_MIPSdspR2" suffix, this is the function that
// WebPRescalerDspInitMSA() installs, and its helpers use MSA intrinsics.
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
  const int num_values = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const frow = wrk->frow;
  rescaler_t* const irow = wrk->irow;
  uint8_t* const dst = wrk->dst;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);
  if (wrk->y_accum != 0) {
    ExportRowExpand_1(frow, irow, dst, num_values, wrk);
  } else {
    ExportRowExpand_0(frow, dst, num_values, wrk);
  }
}
264 
ExportRowShrink_0(const uint32_t * frow,uint32_t * irow,uint8_t * dst,int length,const uint32_t yscale,WebPRescaler * const wrk)265 static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
266                                           uint8_t* dst, int length,
267                                           const uint32_t yscale,
268                                           WebPRescaler* const wrk) {
269   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
270   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
271   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
272   const v4i32 zero = { 0 };
273 
274   while (length >= 16) {
275     v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
276     v16u8 out;
277     LD_UW4(frow, 4, src0, src1, src2, src3);
278     CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
279                       frac0, frac1, frac2, frac3);
280     LD_UW4(irow, 4, src0, src1, src2, src3);
281     SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
282          src0, src1, src2, src3);
283     CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
284     ST_UB(out, dst);
285     ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
286     frow   += 16;
287     irow   += 16;
288     dst    += 16;
289     length -= 16;
290   }
291   if (length > 0) {
292     int x_out;
293     if (length >= 12) {
294       uint32_t val0_m, val1_m, val2_m;
295       v4u32 src0, src1, src2, frac0, frac1, frac2;
296       LD_UW3(frow, 4, src0, src1, src2);
297       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
298       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
299       CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
300       LD_UW3(irow, 4, src0, src1, src2);
301       SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
302       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
303       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
304       CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
305       SW3(val0_m, val1_m, val2_m, dst, 4);
306       ST_UW3(frac0, frac1, frac2, irow, 4);
307       frow   += 12;
308       irow   += 12;
309       dst    += 12;
310       length -= 12;
311     } else if (length >= 8) {
312       uint32_t val0_m, val1_m;
313       v4u32 src0, src1, frac0, frac1;
314       LD_UW2(frow, 4, src0, src1);
315       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
316       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
317       LD_UW2(irow, 4, src0, src1);
318       SUB2(src0, frac0, src1, frac1, src0, src1);
319       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
320       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
321       SW2(val0_m, val1_m, dst, 4);
322       ST_UW2(frac0, frac1, irow, 4);
323       frow   += 8;
324       irow   += 8;
325       dst    += 8;
326       length -= 8;
327     } else if (length >= 4) {
328       uint32_t val0_m;
329       v4u32 frac0;
330       v4u32 src0 = LD_UW(frow);
331       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
332       src0 = LD_UW(irow);
333       src0 = src0 - frac0;
334       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
335       SW(val0_m, dst);
336       ST_UW(frac0, irow);
337       frow   += 4;
338       irow   += 4;
339       dst    += 4;
340       length -= 4;
341     }
342     for (x_out = 0; x_out < length; ++x_out) {
343       const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
344       const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
345       assert(v >= 0 && v <= 255);
346       dst[x_out] = v;
347       irow[x_out] = frac;
348     }
349   }
350 }
351 
ExportRowShrink_1(uint32_t * irow,uint8_t * dst,int length,WebPRescaler * const wrk)352 static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
353                                           int length,
354                                           WebPRescaler* const wrk) {
355   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
356   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
357   const v4i32 zero = { 0 };
358 
359   while (length >= 16) {
360     v4u32 src0, src1, src2, src3;
361     v16u8 dst0;
362     LD_UW4(irow, 4, src0, src1, src2, src3);
363     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
364     ST_UB(dst0, dst);
365     ST_SW4(zero, zero, zero, zero, irow, 4);
366     length -= 16;
367     irow   += 16;
368     dst    += 16;
369   }
370   if (length > 0) {
371     int x_out;
372     if (length >= 12) {
373       uint32_t val0_m, val1_m, val2_m;
374       v4u32 src0, src1, src2;
375       LD_UW3(irow, 4, src0, src1, src2);
376       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
377       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
378       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
379       SW3(val0_m, val1_m, val2_m, dst, 4);
380       ST_SW3(zero, zero, zero, irow, 4);
381       length -= 12;
382       irow   += 12;
383       dst    += 12;
384     } else if (length >= 8) {
385       uint32_t val0_m, val1_m;
386       v4u32 src0, src1;
387       LD_UW2(irow, 4, src0, src1);
388       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
389       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
390       SW2(val0_m, val1_m, dst, 4);
391       ST_SW2(zero, zero, irow, 4);
392       length -= 8;
393       irow   += 8;
394       dst    += 8;
395     } else if (length >= 4) {
396       uint32_t val0_m;
397       const v4u32 src0 = LD_UW(irow + 0);
398       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
399       SW(val0_m, dst);
400       ST_SW(zero, irow);
401       length -= 4;
402       irow   += 4;
403       dst    += 4;
404     }
405     for (x_out = 0; x_out < length; ++x_out) {
406       const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
407       assert(v >= 0 && v <= 255);
408       dst[x_out] = v;
409       irow[x_out] = 0;
410     }
411   }
412 }
413 
// Exports one output row in the vertical-shrink case. yscale is computed as
// wrk->fy_scale * -wrk->y_accum. When it is non-zero, ExportRowShrink_0 is
// used, which reads wrk->frow and leaves the 'frac' values in wrk->irow.
// When it is zero, ExportRowShrink_1 is used, which clears wrk->irow.
// dst_width * num_channels values are written to wrk->dst.
// NOTE(review): despite the "_MIPSdspR2" suffix, this is the function that
// WebPRescalerDspInitMSA() installs, and its helpers use MSA intrinsics.
static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
  const int num_values = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const frow = wrk->frow;
  rescaler_t* const irow = wrk->irow;
  uint8_t* const dst = wrk->dst;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(!wrk->y_expand);
  if (yscale == 0) {
    ExportRowShrink_1(irow, dst, num_values, wrk);
  } else {
    ExportRowShrink_0(frow, irow, dst, num_values, yscale, wrk);
  }
}
429 
430 //------------------------------------------------------------------------------
431 // Entry point
432 
433 extern void WebPRescalerDspInitMSA(void);
434 
WebPRescalerDspInitMSA(void)435 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
436   WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
437   WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
438 }
439 
440 #else     // !WEBP_USE_MSA
441 
442 WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
443 
444 #endif    // WEBP_USE_MSA
445