1 /*
2 * WMV2 - DSP functions Loongson MMI-optimized
3 *
4 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #include "libavutil/avassert.h"
24 #include "constants.h"
25 #include "wmv2dsp_mips.h"
26 #include "libavutil/mips/mmiutils.h"
27
/*
 * Fixed-point IDCT weights.
 * Wk is 2048 * sqrt(2) * cos(k * pi / 16) rounded to the nearest integer
 * for k = 1..7; W0 carries the same value as W4 (2048 == 1 << 11).
 */
#define W0  2048
#define W1  2841    /* k = 1 */
#define W2  2676    /* k = 2 */
#define W3  2408    /* k = 3 */
#define W4  2048    /* k = 4 */
#define W5  1609    /* k = 5 */
#define W6  1108    /* k = 6 */
#define W7   565    /* k = 7 */
36
wmv2_idct_row_mmi(short * b)37 static void wmv2_idct_row_mmi(short * b)
38 {
39 int s1, s2;
40 int a0, a1, a2, a3, a4, a5, a6, a7;
41
42 /* step 1 */
43 a0 = W0 * b[0] + W0 * b[4];
44 a1 = W1 * b[1] + W7 * b[7];
45 a2 = W2 * b[2] + W6 * b[6];
46 a3 = W3 * b[5] - W5 * b[3];
47 a4 = W0 * b[0] - W0 * b[4];
48 a5 = W5 * b[5] + W3 * b[3];
49 a6 = W6 * b[2] - W2 * b[6];
50 a7 = W7 * b[1] - W1 * b[7];
51
52 /* step 2 */
53 s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8; // 1, 3, 5, 7
54 s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8;
55
56 /* step 3 */
57 b[0] = (a0 + a2 + a1 + a5 + 128) >> 8;
58 b[1] = (a4 + a6 + s1 + 128) >> 8;
59 b[2] = (a4 - a6 + s2 + 128) >> 8;
60 b[3] = (a0 - a2 + a7 + a3 + 128) >> 8;
61 b[4] = (a0 - a2 - a7 - a3 + 128) >> 8;
62 b[5] = (a4 - a6 - s2 + 128) >> 8;
63 b[6] = (a4 + a6 - s1 + 128) >> 8;
64 b[7] = (a0 + a2 - a1 - a5 + 128) >> 8;
65 }
66
wmv2_idct_col_mmi(short * b)67 static void wmv2_idct_col_mmi(short * b)
68 {
69 int s1, s2;
70 int a0, a1, a2, a3, a4, a5, a6, a7;
71
72 /* step 1, with extended precision */
73 a0 = (W0 * b[ 0] + W0 * b[32] ) >> 3;
74 a1 = (W1 * b[ 8] + W7 * b[56] + 4) >> 3;
75 a2 = (W2 * b[16] + W6 * b[48] + 4) >> 3;
76 a3 = (W3 * b[40] - W5 * b[24] + 4) >> 3;
77 a4 = (W0 * b[ 0] - W0 * b[32] ) >> 3;
78 a5 = (W5 * b[40] + W3 * b[24] + 4) >> 3;
79 a6 = (W6 * b[16] - W2 * b[48] + 4) >> 3;
80 a7 = (W7 * b[ 8] - W1 * b[56] + 4) >> 3;
81
82 /* step 2 */
83 s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8;
84 s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8;
85
86 /* step 3 */
87 b[ 0] = (a0 + a2 + a1 + a5 + 8192) >> 14;
88 b[ 8] = (a4 + a6 + s1 + 8192) >> 14;
89 b[16] = (a4 - a6 + s2 + 8192) >> 14;
90 b[24] = (a0 - a2 + a7 + a3 + 8192) >> 14;
91
92 b[32] = (a0 - a2 - a7 - a3 + 8192) >> 14;
93 b[40] = (a4 - a6 - s2 + 8192) >> 14;
94 b[48] = (a4 + a6 - s1 + 8192) >> 14;
95 b[56] = (a0 + a2 - a1 - a5 + 8192) >> 14;
96 }
97
ff_wmv2_idct_add_mmi(uint8_t * dest,ptrdiff_t line_size,int16_t * block)98 void ff_wmv2_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
99 {
100 int i;
101 double ftmp[11];
102
103 for (i = 0; i < 64; i += 8)
104 wmv2_idct_row_mmi(block + i);
105 for (i = 0; i < 8; i++)
106 wmv2_idct_col_mmi(block + i);
107
108 __asm__ volatile (
109 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
110
111 // low 4 loop
112 MMI_LDC1(%[ftmp1], %[block], 0x00)
113 MMI_LDC1(%[ftmp2], %[block], 0x08)
114 MMI_LDC1(%[ftmp3], %[block], 0x10)
115 MMI_LDC1(%[ftmp4], %[block], 0x18)
116 MMI_LDC1(%[ftmp5], %[block], 0x20)
117 MMI_LDC1(%[ftmp6], %[block], 0x28)
118 MMI_LDC1(%[ftmp7], %[block], 0x30)
119 MMI_LDC1(%[ftmp8], %[block], 0x38)
120
121 MMI_LDC1(%[ftmp9], %[dest], 0x00)
122 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
123 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
124 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
125 "paddh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
126 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
127 MMI_SDC1(%[ftmp1], %[dest], 0x00)
128 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
129
130 MMI_LDC1(%[ftmp9], %[dest], 0x00)
131 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
132 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
133 "paddh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
134 "paddh %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
135 "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
136 MMI_SDC1(%[ftmp3], %[dest], 0x00)
137 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
138
139 MMI_LDC1(%[ftmp9], %[dest], 0x00)
140 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
141 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
142 "paddh %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
143 "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
144 "packushb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
145 MMI_SDC1(%[ftmp5], %[dest], 0x00)
146 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
147
148 MMI_LDC1(%[ftmp9], %[dest], 0x00)
149 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
150 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
151 "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
152 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
153 "packushb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
154 MMI_SDC1(%[ftmp7], %[dest], 0x00)
155
156 PTR_ADDIU "%[block], %[block], 0x40 \n\t"
157 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
158
159 // high 4 loop
160 MMI_LDC1(%[ftmp1], %[block], 0x00)
161 MMI_LDC1(%[ftmp2], %[block], 0x08)
162 MMI_LDC1(%[ftmp3], %[block], 0x10)
163 MMI_LDC1(%[ftmp4], %[block], 0x18)
164 MMI_LDC1(%[ftmp5], %[block], 0x20)
165 MMI_LDC1(%[ftmp6], %[block], 0x28)
166 MMI_LDC1(%[ftmp7], %[block], 0x30)
167 MMI_LDC1(%[ftmp8], %[block], 0x38)
168
169 MMI_LDC1(%[ftmp9], %[dest], 0x00)
170 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
171 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
172 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
173 "paddh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
174 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
175 MMI_SDC1(%[ftmp1], %[dest], 0x00)
176 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
177
178 MMI_LDC1(%[ftmp9], %[dest], 0x00)
179 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
180 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
181 "paddh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
182 "paddh %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
183 "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
184 MMI_SDC1(%[ftmp3], %[dest], 0x00)
185 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
186
187 MMI_LDC1(%[ftmp9], %[dest], 0x00)
188 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
189 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
190 "paddh %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
191 "paddh %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
192 "packushb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
193 MMI_SDC1(%[ftmp5], %[dest], 0x00)
194 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
195
196 MMI_LDC1(%[ftmp9], %[dest], 0x00)
197 "punpckhbh %[ftmp10], %[ftmp9], %[ftmp0] \n\t"
198 "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
199 "paddh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
200 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
201 "packushb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
202 MMI_SDC1(%[ftmp7], %[dest], 0x00)
203 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
204 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
205 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
206 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
207 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
208 [ftmp10]"=&f"(ftmp[10]),
209 [block]"+&r"(block), [dest]"+&r"(dest)
210 : [line_size]"r"((mips_reg)line_size)
211 : "memory"
212 );
213 }
214
ff_wmv2_idct_put_mmi(uint8_t * dest,ptrdiff_t line_size,int16_t * block)215 void ff_wmv2_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
216 {
217 int i;
218 double ftmp[8];
219
220 for (i = 0; i < 64; i += 8)
221 wmv2_idct_row_mmi(block + i);
222 for (i = 0; i < 8; i++)
223 wmv2_idct_col_mmi(block + i);
224
225 __asm__ volatile (
226 // low 4 loop
227 MMI_LDC1(%[ftmp0], %[block], 0x00)
228 MMI_LDC1(%[ftmp1], %[block], 0x08)
229 MMI_LDC1(%[ftmp2], %[block], 0x10)
230 MMI_LDC1(%[ftmp3], %[block], 0x18)
231 MMI_LDC1(%[ftmp4], %[block], 0x20)
232 MMI_LDC1(%[ftmp5], %[block], 0x28)
233 MMI_LDC1(%[ftmp6], %[block], 0x30)
234 MMI_LDC1(%[ftmp7], %[block], 0x38)
235 "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
236 "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
237 "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
238 "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
239 MMI_SDC1(%[ftmp0], %[dest], 0x00)
240 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
241 MMI_SDC1(%[ftmp2], %[dest], 0x00)
242 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
243 MMI_SDC1(%[ftmp4], %[dest], 0x00)
244 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
245 MMI_SDC1(%[ftmp6], %[dest], 0x00)
246
247 PTR_ADDIU "%[block], %[block], 0x40 \n\t"
248 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
249
250 // high 4 loop
251 MMI_LDC1(%[ftmp0], %[block], 0x00)
252 MMI_LDC1(%[ftmp1], %[block], 0x08)
253 MMI_LDC1(%[ftmp2], %[block], 0x10)
254 MMI_LDC1(%[ftmp3], %[block], 0x18)
255 MMI_LDC1(%[ftmp4], %[block], 0x20)
256 MMI_LDC1(%[ftmp5], %[block], 0x28)
257 MMI_LDC1(%[ftmp6], %[block], 0x30)
258 MMI_LDC1(%[ftmp7], %[block], 0x38)
259 "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
260 "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
261 "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
262 "packushb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
263 MMI_SDC1(%[ftmp0], %[dest], 0x00)
264 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
265 MMI_SDC1(%[ftmp2], %[dest], 0x00)
266 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
267 MMI_SDC1(%[ftmp4], %[dest], 0x00)
268 PTR_ADDU "%[dest], %[dest], %[line_size] \n\t"
269 MMI_SDC1(%[ftmp6], %[dest], 0x00)
270 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
271 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
272 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
273 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
274 [block]"+&r"(block), [dest]"+&r"(dest)
275 : [line_size]"r"((mips_reg)line_size)
276 : "memory"
277 );
278 }
279