/*
 * Loongson SIMD optimized hpeldsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "hpeldsp_mips.h"
#include "libavcodec/bit_depth_template.c"
#include "libavutil/mips/mmiutils.h"
#include "constants.h"

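/* Copy a 4-pixel-wide, h-row block from pixels to block, two rows per
 * iteration, using unaligned 32-bit loads (MMI_ULWC1). */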
void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                     \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULWC1(%[ftmp1], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x02           \n\t"

        MMI_SWC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"
        MMI_SWC1(%[ftmp1], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"

        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

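/* 8-wide copy: four rows per iteration, moved with unaligned 64-bit loads
 * (MMI_ULDC1) and 64-bit stores (MMI_SDC1). */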
void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                     \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp2], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp3], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"
        MMI_SDC1(%[ftmp2], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"
        MMI_SDC1(%[ftmp3], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"

        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

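/* 16-wide copy: each row is transferred as two 8-byte halves at offsets
 * 0x00 and 0x08, four rows per iteration. */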
void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                     \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp4], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp6], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp5], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp7], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp2], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        MMI_SDC1(%[ftmp3], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"
        MMI_SDC1(%[ftmp4], %[block], 0x00)
        MMI_SDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"
        MMI_SDC1(%[ftmp5], %[block], 0x00)
        MMI_SDC1(%[ftmp7], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]    \n\t"

        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

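/* Averaging put, 4 pixels wide: blends the source rows into the rows
 * already in block; pavgb is the per-byte rounded average
 * (a + b + 1) >> 1, as on x86. */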
void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                     \n\t"
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]    \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]    \n\t"
        MMI_ULWC1(%[ftmp2], %[block], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)

        PTR_ADDI   "%[h],       %[h],           -0x02           \n\t"

        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU   "%[pixels],  %[addr0],       %[line_size]    \n\t"
        PTR_ADDU   "%[block],   %[addr1],       %[line_size]    \n\t"

        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

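/* 8-wide averaging put: two rows per half-iteration are loaded from both
 * pixels and block, averaged with pavgb and stored back to block. */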
void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[3];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]    \n\t"
        "1:                                                     \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]        \n\t"
        PTR_ADDU   "%[block],   %[block],       %[addr2]        \n\t"

        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]        \n\t"
        PTR_ADDU   "%[block],   %[block],       %[addr2]        \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

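/* 16-wide averaging put: same as the 8-wide version with two 8-byte
 * halves per row. */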
void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    double ftmp[8];
    mips_reg addr[1];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                     \n\t"
        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[addr0],   %[block],       %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        PTR_ADDU   "%[block],   %[addr0],       %[line_size]    \n\t"

        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[addr0],   %[block],       %[line_size]    \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        PTR_ADDU   "%[block],   %[addr0],       %[line_size]    \n\t"

        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

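/* Store the per-byte rounded average of two source streams into dst, two
 * 4-pixel rows per iteration; each stream advances by its own stride. */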
inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                     \n\t"
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]  \n\t"
        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]  \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x02           \n\t"

        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]   \n\t"
        MMI_SWC1(%[ftmp1], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]   \n\t"

        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

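/* 8-wide two-source average; four rows per iteration, with the doubled
 * strides precomputed into addr2..addr4 outside the loop. */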
inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]  \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]  \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]   \n\t"

        "1:                                                     \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

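/* 16-wide two-source average: same structure as the 8-wide version with
 * an extra 8-byte load/average/store per row. */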
inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[8];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]  \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]  \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]   \n\t"

        "1:                                                     \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

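/* Average two sources, then average the result once more with the bytes
 * already in dst (read-modify-write), two 4-pixel rows per iteration. */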
inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                     \n\t"
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]  \n\t"
        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]  \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        PTR_ADDU   "%[addr2],   %[dst],         %[dst_stride]   \n\t"
        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
        MMI_ULWC1(%[ftmp5], %[addr2], 0x00)
        PTR_ADDI   "%[h],       %[h],           -0x02           \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]        \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        MMI_SWC1(%[ftmp1], %[addr2], 0x00)
        PTR_ADDU   "%[dst],     %[addr2],       %[dst_stride]   \n\t"

        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

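/* 8-wide version of the averaging two-source blend; four rows per
 * iteration. */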
inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]  \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]  \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]   \n\t"

        "1:                                                     \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]   \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]   \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

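/* The 16-wide case is handled as two independent 8-wide columns. */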
inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1,
                            src_stride2, h);
    ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, dst_stride,
                            src_stride1, src_stride2, h);
}

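/* The *_x2 functions are horizontal half-pel interpolation: each pixel is
 * averaged with its right neighbour, i.e. an l2 average of pixels and
 * pixels + 1 with equal strides. */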
void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                             line_size, h);
}

void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

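/* No-rounding two-source average: pavgb always rounds up, so the
 * truncating average is formed by complementing inputs and output via the
 * identity ~pavgb(~a, ~b) == (a + b) >> 1; ftmp4 holds the all-ones XOR
 * mask. */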
inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[5];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]        \n\t"
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]  \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]  \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]   \n\t"

        "1:                                                     \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]        \n\t"
        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]        \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]  \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]  \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]        \n\t"
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]        \n\t"
        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]        \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]        \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04           \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size,
                                   line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

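/* The *_y2 functions are vertical half-pel interpolation: each row is
 * averaged with the next one, i.e. an l2 average of pixels and
 * pixels + line_size. */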
void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                             line_size, line_size, h);
}

void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + line_size,
                                   line_size, line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

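/* Bilinear (half-pel in both directions) interpolation in plain C using
 * the usual SWAR trick: the low two bits (l*) and the pre-shifted high
 * six bits (h*) of four packed pixels are summed separately so no carries
 * cross byte lanes; 0x02020202 supplies the +2 rounding term before the
 * final >> 2. */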
void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                       0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
        a = AV_RN32(pixels);
        b = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
                  0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
    }
}

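/* MMI version of the 8-wide bilinear case: each source row and its
 * one-pixel-shifted copy are widened to 16-bit lanes with
 * punpcklbh/punpckhbh, the horizontal pair sum is carried over from the
 * previous row, ftmp6 is built to hold the constant 2 in every lane for
 * rounding, and psrlh plus packushb produce the output row. The plain C
 * fallback is kept under #else for reference. */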
void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
#if 1
    double ftmp[10];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]        \n\t"
        "dli        %[addr0],   0x0f                            \n\t"
        "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]        \n\t"
        "dmtc1      %[addr0],   %[ftmp8]                        \n\t"
        "dli        %[addr0],   0x01                            \n\t"
        "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
        "dmtc1      %[addr0],   %[ftmp8]                        \n\t"
        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"

        "dli        %[addr0],   0x02                            \n\t"
        "dmtc1      %[addr0],   %[ftmp9]                        \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
        "mov.d      %[ftmp1],   %[ftmp0]                        \n\t"
        "mov.d      %[ftmp5],   %[ftmp4]                        \n\t"
        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]        \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]        \n\t"
        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]        \n\t"
        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]        \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
        "xor        %[addr0],   %[addr0],       %[addr0]        \n\t"
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]    \n\t"
        ".p2align   3                                           \n\t"

        "1:                                                     \n\t"
        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]        \n\t"
        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
        "mov.d      %[ftmp1],   %[ftmp0]                        \n\t"
        "mov.d      %[ftmp3],   %[ftmp2]                        \n\t"
        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]        \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]        \n\t"
        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]        \n\t"
        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]        \n\t"
        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]        \n\t"
        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]        \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]        \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]        \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
        "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]        \n\t"
        "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]        \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]        \n\t"
        MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]    \n\t"
        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]        \n\t"
        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
        "mov.d      %[ftmp3],   %[ftmp2]                        \n\t"
        "mov.d      %[ftmp5],   %[ftmp4]                        \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]        \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]        \n\t"
        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]        \n\t"
        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]        \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]        \n\t"
        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]        \n\t"
        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]        \n\t"
        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]        \n\t"
        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]        \n\t"
        "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]        \n\t"
        MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]    \n\t"
        PTR_ADDI   "%[h],       %[h],           -0x02           \n\t"
        "bnez       %[h],       1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [h]"+&r"(h),                      [pixels]"+&r"(pixels)
        : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
#else
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                           0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                      0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
#endif
}

void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    ff_put_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

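/* Same bilinear interpolation as ff_put_pixels4_xy2_8_mmi, with the
 * result merged into the existing destination through rnd_avg32(). */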
void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                       0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
        a = AV_RN32(pixels);
        b = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
                  0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
    }
}

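/* 8-wide averaging bilinear case, processed as two 4-wide columns (outer
 * j loop); the final pointer adjustments rewind each pointer to the top
 * of the second column. */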
void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                           0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                      0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

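/* No-rounding bilinear variant: same scheme as the put version, but the
 * per-pixel bias is 0x01010101 (+1) instead of 0x02020202 (+2). */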
void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                           0x01010101UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                      0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}