/*
 * Loongson SIMD optimized hpeldsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "hpeldsp_mips.h"
#include "libavcodec/bit_depth_template.c"
#include "libavutil/mips/mmiutils.h"
#include "constants.h"

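/* Straight copies: each loop iteration loads a few rows from pixels and
 * stores them to block.  The MMI_UL*C1 / MMI_S*C1 macros from mmiutils.h
 * wrap 32-bit (ULWC1/SWC1) and 64-bit (ULDC1/SDC1) transfers through the
 * FP/SIMD registers; the UL* loads tolerate unaligned sources, while the
 * plain stores assume block is suitably aligned.  PTR_ADDU and PTR_ADDI
 * expand to pointer-width adds so the same code serves 32- and 64-bit
 * MIPS ABIs. */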
void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1: \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp1], %[pixels], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"

        PTR_ADDI "%[h], %[h], -0x02 \n\t"

        MMI_SWC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"
        MMI_SWC1(%[ftmp1], %[block], 0x00)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"

        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp2], %[pixels], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp3], %[pixels], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"

        PTR_ADDI "%[h], %[h], -0x04 \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp2], %[block], 0x00)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp3], %[block], 0x00)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"

        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp4], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp6], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp5], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp7], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"

        PTR_ADDI "%[h], %[h], -0x04 \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp2], %[block], 0x08)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        MMI_SDC1(%[ftmp3], %[block], 0x08)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp4], %[block], 0x00)
        MMI_SDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"
        MMI_SDC1(%[ftmp5], %[block], 0x00)
        MMI_SDC1(%[ftmp7], %[block], 0x08)
        PTR_ADDU "%[block], %[block], %[line_size] \n\t"

        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

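/* Averaging variants: pavgb computes a per-byte rounded average,
 * (a + b + 1) >> 1, like its x86 namesake, so the ff_avg_* functions
 * blend the freshly loaded rows with what is already in block. */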
void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULWC1(%[ftmp2], %[block], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)

        PTR_ADDI "%[h], %[h], -0x02 \n\t"

        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU "%[pixels], %[addr0], %[line_size] \n\t"
        PTR_ADDU "%[block], %[addr1], %[line_size] \n\t"

        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[3];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[line_size], %[line_size] \n\t"
        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"

        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU "%[addr0], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU "%[pixels], %[pixels], %[addr2] \n\t"
        PTR_ADDU "%[block], %[block], %[addr2] \n\t"

        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    double ftmp[8];
    mips_reg addr[1];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU "%[addr0], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        PTR_ADDU "%[block], %[addr0], %[line_size] \n\t"

        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU "%[addr0], %[block], %[line_size] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        PTR_ADDU "%[block], %[addr0], %[line_size] \n\t"

        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),
          [block]"+&r"(block), [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

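/* The *_l2 helpers average two independently strided sources into dst.
 * The half-pel x2/y2 wrappers further below are built on top of them by
 * passing the same buffer twice, offset by one pixel or one line. */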
inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[addr0], %[src_stride1] \n\t"
        PTR_ADDU "%[src2], %[addr1], %[src_stride2] \n\t"

        PTR_ADDI "%[h], %[h], -0x02 \n\t"

        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        PTR_ADDU "%[dst], %[dst], %[dst_stride] \n\t"
        MMI_SWC1(%[ftmp1], %[dst], 0x00)
        PTR_ADDU "%[dst], %[dst], %[dst_stride] \n\t"

        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"

        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[8];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"

        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[addr0], %[src_stride1] \n\t"
        PTR_ADDU "%[src2], %[addr1], %[src_stride2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        PTR_ADDU "%[addr2], %[dst], %[dst_stride] \n\t"
        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
        MMI_ULWC1(%[ftmp5], %[addr2], 0x00)
        PTR_ADDI "%[h], %[h], -0x02 \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        MMI_SWC1(%[ftmp1], %[addr2], 0x00)
        PTR_ADDU "%[dst], %[addr2], %[dst_stride] \n\t"

        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"

        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        PTR_ADDU "%[addr5], %[dst], %[dst_stride] \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]), [addr5]"=&r"(addr[5]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1,
                            src_stride2, h);
    ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, dst_stride,
                            src_stride1, src_stride2, h);
}

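/* Half-pel wrappers: x2 averages each pixel with its right neighbour
 * (src2 = pixels + 1), y2, further below, with the pixel one line down
 * (src2 = pixels + line_size).  The 16-wide avg variants simply run the
 * 8-wide kernel twice. */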
void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                             line_size, h);
}

void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
                            line_size, h);
}

void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

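/* No-rounding average via complement: pavgb always rounds up, but for
 * bytes ~((~a + ~b + 1) >> 1) == (a + b) >> 1, so XORing the inputs with
 * all-ones (ftmp4 from pcmpeqb), averaging, and XORing the result back
 * yields the truncating average the no_rnd functions require. */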
inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    double ftmp[5];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
        PTR_ADDU "%[addr2], %[src_stride1], %[src_stride1] \n\t"
        PTR_ADDU "%[addr3], %[src_stride2], %[src_stride2] \n\t"
        PTR_ADDU "%[addr4], %[dst_stride], %[dst_stride] \n\t"

        "1: \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pxor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pxor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        "pxor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "pxor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pxor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pxor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU "%[addr0], %[src1], %[src_stride1] \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU "%[addr1], %[src2], %[src_stride2] \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU "%[src1], %[src1], %[addr2] \n\t"
        "pxor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pxor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        "pxor %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
        "pxor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
        "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "pxor %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "pxor %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU "%[src2], %[src2], %[addr3] \n\t"
        PTR_ADDU "%[dst], %[dst], %[addr4] \n\t"

        PTR_ADDI "%[h], %[h], -0x04 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]), [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst), [src1]"+&r"(src1),
          [src2]"+&r"(src2), [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size,
                                   line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                             line_size, line_size, h);
}

void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
                            line_size, line_size, h);
}

void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + line_size,
                                   line_size, line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

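/* 2x2 bilinear (xy2) interpolation: each output byte is
 * (A + B + C + D + 2) >> 2, where A..D are the four neighbouring input
 * pixels.  The scalar SWAR trick below splits each byte into its low two
 * bits (masked with 0x03030303UL, accumulated in l0/l1 together with the
 * 0x02020202UL rounding term) and its high six bits (pre-shifted via the
 * 0xFCFCFCFCUL mask, accumulated in h0/h1), so four pixels are summed in
 * a single 32-bit register without cross-byte carries. */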
void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                   0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
        a = AV_RN32(pixels);
        b = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
              0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
    }
}

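/* MMI version of the 8-wide xy2 kernel.  The prologue builds the rounding
 * constant 0x0002 in every halfword of ftmp6 (all-ones from pcmpeqw,
 * shifted right by 15 then left by 1), widens the current row's bytes to
 * 16 bits with punpck against the zero register ftmp7, and keeps the
 * previous row's horizontal pair-sums live across iterations, so each
 * loop body only adds one new row before shifting right by 2 and packing
 * back with saturation.  The #else branch preserves the scalar reference
 * implementation. */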
void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
#if 1
    double ftmp[10];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
        "dli %[addr0], 0x0f \n\t"
        "pcmpeqw %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
        "dmtc1 %[addr0], %[ftmp8] \n\t"
        "dli %[addr0], 0x01 \n\t"
        "psrlh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
        "dmtc1 %[addr0], %[ftmp8] \n\t"
        "psllh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"

        "dli %[addr0], 0x02 \n\t"
        "dmtc1 %[addr0], %[ftmp9] \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
        "mov.d %[ftmp1], %[ftmp0] \n\t"
        "mov.d %[ftmp5], %[ftmp4] \n\t"
        "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
        "punpckhbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "punpckhbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
        "xor %[addr0], %[addr0], %[addr0] \n\t"
        PTR_ADDU "%[pixels], %[pixels], %[line_size] \n\t"
        ".p2align 3 \n\t"

        "1: \n\t"
        PTR_ADDU "%[addr1], %[pixels], %[addr0] \n\t"
        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
        "mov.d %[ftmp1], %[ftmp0] \n\t"
        "mov.d %[ftmp3], %[ftmp2] \n\t"
        "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
        "punpckhbh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
        "punpckhbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "paddush %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
        "paddush %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
        "psrlh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
        "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[line_size] \n\t"
        PTR_ADDU "%[addr1], %[pixels], %[addr0] \n\t"
        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
        "mov.d %[ftmp3], %[ftmp2] \n\t"
        "mov.d %[ftmp5], %[ftmp4] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
        "punpckhbh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "punpckhbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
        "paddush %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
        "paddush %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
        "paddush %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
        "paddush %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
        "paddush %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
        "paddush %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "psrlh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
        "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
        "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
        MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[line_size] \n\t"
        PTR_ADDU "%[h], %[h], -0x02 \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [h]"+&r"(h), [pixels]"+&r"(pixels)
        : [block]"r"(block), [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
#else
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                       0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                  0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
#endif
}

void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    ff_put_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

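/* avg_xy2: the same bilinear sum as above, then blended with the existing
 * destination through rnd_avg32(), FFmpeg's per-byte rounded-average
 * helper for packed 32-bit values. */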
void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                   0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
        a = AV_RN32(pixels);
        b = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
              0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
    }
}

void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                       0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                  0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

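/* no_rnd xy2: identical to the rounding version except that the bias
 * added to the low-bit sums is 0x01010101UL instead of 0x02020202UL, so
 * (sum + bias) >> 2 truncates rather than rounds to nearest. */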
void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                       0x01010101UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a = AV_RN32(pixels);
            b = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                  0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}