• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Loongson SIMD optimized hpeldsp
3  *
4  * Copyright (c) 2016 Loongson Technology Corporation Limited
5  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "hpeldsp_mips.h"
25 #include "libavcodec/bit_depth_template.c"
26 #include "libavutil/mips/mmiutils.h"
27 #include "constants.h"
28 
/*
 * Copy h rows from pixels to block, one word per row (presumably the 4
 * bytes implied by the "pixels4" name).
 *
 * block     - destination; stepped by line_size after each stored row
 * pixels    - source; stepped by line_size after each loaded row
 * line_size - byte stride, applied to both pointers
 * h         - row count. Two rows are handled per loop pass and the loop
 *             exits only when h is exactly zero after the decrement, so h
 *             must be a positive multiple of 2.
 *
 * MMI_ULWC1 / MMI_SWC1 / PTR_ADDU / PTR_ADDI are macros defined outside
 * this file; going by their names they are an unaligned word load into an
 * FP register, a word store from one, and pointer-width add / add-immediate
 * (to be confirmed against mmiutils.h).
 *
 * NOTE(review): ftmp2 and ftmp3 are bound as outputs below but the asm
 * template never references them.
 */
ff_put_pixels4_8_mmi(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)29 void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
30     ptrdiff_t line_size, int h)
31 {
32     double ftmp[4];
33     DECLARE_VAR_LOW32;
34 
35     __asm__ volatile (
        /* Loop head: each pass moves two rows. */
36         "1:                                                             \n\t"
        /* Load two consecutive source rows, advancing pixels after each. */
37         MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
38         PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
39         MMI_ULWC1(%[ftmp1], %[pixels], 0x00)
40         PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
41 
        /* Account for the two rows just read. */
42         PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
43 
        /* Write the two rows out, advancing block after each. */
44         MMI_SWC1(%[ftmp0], %[block], 0x00)
45         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
46         MMI_SWC1(%[ftmp1], %[block], 0x00)
47         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
48 
        /* Keep looping while rows remain. */
49         "bnez       %[h],       1b                                      \n\t"
        /* Outputs: scratch FP registers, then the in/out pointers and counter. */
50         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
51           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
52           RESTRICT_ASM_LOW32
53           [block]"+&r"(block),              [pixels]"+&r"(pixels),
54           [h]"+&r"(h)
55         : [line_size]"r"((mips_reg)line_size)
        /* The asm writes through block, so the compiler must not cache memory. */
56         : "memory"
57     );
58 }
59 
/*
 * Copy h rows from pixels to block, one doubleword per row (presumably the
 * 8 bytes implied by the "pixels8" name).
 *
 * block     - destination; stepped by line_size after each stored row
 * pixels    - source; stepped by line_size after each loaded row
 * line_size - byte stride, applied to both pointers
 * h         - row count. Four rows are handled per loop pass and the loop
 *             exits only when h is exactly zero after the decrement, so h
 *             must be a positive multiple of 4.
 *
 * MMI_ULDC1 / MMI_SDC1 are macros defined outside this file; by name they
 * are an unaligned doubleword load into an FP register and a doubleword
 * store (to be confirmed against mmiutils.h).
 */
ff_put_pixels8_8_mmi(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)60 void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
61     ptrdiff_t line_size, int h)
62 {
63     double ftmp[4];
64     DECLARE_VAR_ALL64;
65 
66     __asm__ volatile (
        /* Loop head: each pass moves four rows. */
67         "1:                                                             \n\t"
        /* Load four consecutive source rows into ftmp0..ftmp3. */
68         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
69         PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
70         MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
71         PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
72         MMI_ULDC1(%[ftmp2], %[pixels], 0x00)
73         PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
74         MMI_ULDC1(%[ftmp3], %[pixels], 0x00)
75         PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
76 
        /* Account for the four rows just read. */
77         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
78 
        /* Store the four rows in the same order, advancing block after each. */
79         MMI_SDC1(%[ftmp0], %[block], 0x00)
80         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
81         MMI_SDC1(%[ftmp1], %[block], 0x00)
82         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
83         MMI_SDC1(%[ftmp2], %[block], 0x00)
84         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
85         MMI_SDC1(%[ftmp3], %[block], 0x00)
86         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
87 
        /* Keep looping while rows remain. */
88         "bnez       %[h],       1b                                      \n\t"
89         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
90           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
91           RESTRICT_ASM_ALL64
92           [block]"+&r"(block),              [pixels]"+&r"(pixels),
93           [h]"+&r"(h)
94         : [line_size]"r"((mips_reg)line_size)
        /* The asm writes through block, so the compiler must not cache memory. */
95         : "memory"
96     );
97 }
98 
/*
 * Copy h rows from pixels to block, two doublewords per row (offsets 0x00
 * and 0x08, i.e. presumably 16 bytes as the "pixels16" name implies).
 *
 * block     - destination; stepped by line_size after each stored row
 * pixels    - source; stepped by line_size after each loaded row
 * line_size - byte stride, applied to both pointers
 * h         - row count. Four rows are handled per loop pass and the loop
 *             exits only when h is exactly zero after the decrement, so h
 *             must be a positive multiple of 4.
 *
 * Register use per pass: rows 0..3 keep their low halves in
 * ftmp0/ftmp1/ftmp4/ftmp5 and their high halves in ftmp2/ftmp3/ftmp6/ftmp7.
 */
ff_put_pixels16_8_mmi(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)99 void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
100     ptrdiff_t line_size, int h)
101 {
102     double ftmp[8];
103     DECLARE_VAR_ALL64;
104 
105     __asm__ volatile (
        /* Loop head: each pass moves four rows. */
106         "1:                                                            \n\t"
        /* Load both halves of four consecutive source rows. */
107         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
108         MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
109         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
110         MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
111         MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
112         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
113         MMI_ULDC1(%[ftmp4], %[pixels], 0x00)
114         MMI_ULDC1(%[ftmp6], %[pixels], 0x08)
115         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
116         MMI_ULDC1(%[ftmp5], %[pixels], 0x00)
117         MMI_ULDC1(%[ftmp7], %[pixels], 0x08)
118         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
119 
        /* Account for the four rows just read. */
120         PTR_ADDI   "%[h],       %[h],           -0x04                  \n\t"
121 
        /* Store the rows with the same register pairing used for the loads. */
122         MMI_SDC1(%[ftmp0], %[block], 0x00)
123         MMI_SDC1(%[ftmp2], %[block], 0x08)
124         PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
125         MMI_SDC1(%[ftmp1], %[block], 0x00)
126         MMI_SDC1(%[ftmp3], %[block], 0x08)
127         PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
128         MMI_SDC1(%[ftmp4], %[block], 0x00)
129         MMI_SDC1(%[ftmp6], %[block], 0x08)
130         PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
131         MMI_SDC1(%[ftmp5], %[block], 0x00)
132         MMI_SDC1(%[ftmp7], %[block], 0x08)
133         PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
134 
        /* Keep looping while rows remain. */
135         "bnez       %[h],       1b                                     \n\t"
136         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
137           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
138           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
139           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
140           RESTRICT_ASM_ALL64
141           [block]"+&r"(block),              [pixels]"+&r"(pixels),
142           [h]"+&r"(h)
143         : [line_size]"r"((mips_reg)line_size)
        /* The asm writes through block, so the compiler must not cache memory. */
144         : "memory"
145     );
146 }
147 
/*
 * Average h rows of pixels into block: each destination word is replaced by
 * the pavgb (packed per-byte average) of the source word and the word
 * already stored in block. One word per row (presumably 4 bytes).
 *
 * block     - destination, read and then overwritten
 * pixels    - source
 * line_size - byte stride, applied to both pointers
 * h         - row count. Two rows are handled per loop pass and the loop
 *             exits only when h is exactly zero after the decrement, so h
 *             must be a positive multiple of 2.
 *
 * addr0 / addr1 hold the address of the second row of the current pair in
 * pixels / block respectively.
 */
ff_avg_pixels4_8_mmi(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)148 void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
149     ptrdiff_t line_size, int h)
150 {
151     double ftmp[4];
152     mips_reg addr[2];
153     DECLARE_VAR_LOW32;
154 
155     __asm__ volatile (
        /* Loop head: each pass averages two rows. */
156         "1:                                                             \n\t"
        /* Source rows 0 and 1 -> ftmp0, ftmp1. */
157         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
158         MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
159         MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        /* Current destination rows 0 and 1 -> ftmp2, ftmp3. */
160         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
161         MMI_ULWC1(%[ftmp2], %[block], 0x00)
162         MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
163 
        /* Account for the two rows of this pass. */
164         PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
165 
        /* Blend source with destination and write back in place. */
166         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
167         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
168         MMI_SWC1(%[ftmp0], %[block], 0x00)
169         MMI_SWC1(%[ftmp1], %[addr1], 0x00)
        /* Step both pointers to the row after the pair (second row + stride). */
170         PTR_ADDU   "%[pixels],  %[addr0],       %[line_size]            \n\t"
171         PTR_ADDU   "%[block],   %[addr1],       %[line_size]            \n\t"
172 
        /* Keep looping while rows remain. */
173         "bnez       %[h],       1b                                      \n\t"
174         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
175           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
176           RESTRICT_ASM_LOW32
177           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
178           [block]"+&r"(block),              [pixels]"+&r"(pixels),
179           [h]"+&r"(h)
180         : [line_size]"r"((mips_reg)line_size)
        /* The asm reads and writes through block. */
181         : "memory"
182     );
183 }
184 
/*
 * Average h rows of pixels into block: each destination doubleword is
 * replaced by the pavgb (packed per-byte average) of the source doubleword
 * and the doubleword already stored in block. One doubleword per row
 * (presumably 8 bytes).
 *
 * block     - destination, read and then overwritten
 * pixels    - source
 * line_size - byte stride, applied to both pointers
 * h         - row count. Four rows (two pairs) are handled per loop pass
 *             and the loop exits only when h is exactly zero after the
 *             decrement, so h must be a positive multiple of 4.
 *
 * addr2 is computed once before the loop as 2 * line_size and is used to
 * step both pointers past a row pair. addr0 / addr1 address the second row
 * of the current pair in pixels / block. The second destination row is
 * stored through MMI_SDXC1(block, line_size), a macro defined outside this
 * file (by name an indexed doubleword store; the ADDRT declarations are
 * presumably its scratch register - confirm against mmiutils.h).
 */
ff_avg_pixels8_8_mmi(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)185 void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
186     ptrdiff_t line_size, int h)
187 {
188     double ftmp[4];
189     mips_reg addr[3];
190     DECLARE_VAR_ALL64;
191     DECLARE_VAR_ADDRT;
192 
193     __asm__ volatile (
        /* Loop-invariant: addr2 = 2 * line_size (stride of one row pair). */
194         PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
195         "1:                                                             \n\t"
        /* First row pair: load source rows, then current destination rows. */
196         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
197         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
198         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
199         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
200         MMI_ULDC1(%[ftmp2], %[block], 0x00)
201         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        /* Blend and write back rows 0 and 1 of the pair. */
202         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
203         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
204         MMI_SDC1(%[ftmp0], %[block], 0x00)
205         MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        /* Advance both pointers by two rows. */
206         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
207         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
208 
        /* Second row pair: identical sequence on the next two rows. */
209         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
210         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
211         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
212         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
213         MMI_ULDC1(%[ftmp2], %[block], 0x00)
214         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
215         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
216         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
217         MMI_SDC1(%[ftmp0], %[block], 0x00)
218         MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
219         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
220         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
221 
        /* Four rows done; loop while rows remain. */
222         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
223         "bnez       %[h],       1b                                      \n\t"
224         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
225           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
226           RESTRICT_ASM_ALL64
227           RESTRICT_ASM_ADDRT
228           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
229           [addr2]"=&r"(addr[2]),
230           [block]"+&r"(block),              [pixels]"+&r"(pixels),
231           [h]"+&r"(h)
232         : [line_size]"r"((mips_reg)line_size)
        /* The asm reads and writes through block. */
233         : "memory"
234     );
235 }
236 
/*
 * Average h rows of pixels into block: each destination doubleword is
 * replaced by the pavgb (packed per-byte average) of the source doubleword
 * and the doubleword already stored in block. Two doublewords per row
 * (offsets 0x00 and 0x08, presumably 16 bytes).
 *
 * block     - destination, read and then overwritten
 * pixels    - source
 * line_size - byte stride, applied to both pointers
 * h         - row count. It is decremented by 4 at the top of each loop
 *             pass and tested at the bottom; the loop exits only when it is
 *             exactly zero, so h must be a positive multiple of 4.
 *
 * Each pass is two identical half-passes of two rows. Within a half-pass
 * ftmp0/ftmp4 and ftmp1/ftmp5 hold the low/high halves of the two source
 * rows, ftmp2/ftmp6 and ftmp3/ftmp7 the matching destination rows, and
 * addr0 addresses the second destination row.
 */
ff_avg_pixels16_8_mmi(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)237 void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
238     ptrdiff_t line_size, int h)
239 {
240     double ftmp[8];
241     mips_reg addr[1];
242     DECLARE_VAR_ALL64;
243 
244     __asm__ volatile (
245         "1:                                                             \n\t"
        /* Row counter is updated up front; it is only tested at the end. */
246         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        /* Half-pass 1: two source rows (pixels advances past both). */
247         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
248         MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
249         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
250         MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
251         MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
252         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        /* Matching destination rows; block itself is not moved yet. */
253         MMI_ULDC1(%[ftmp2], %[block], 0x00)
254         MMI_ULDC1(%[ftmp6], %[block], 0x08)
255         PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
256         MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
257         MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        /* Blend source with destination, low and high halves separately. */
258         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
259         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
260         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
261         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        /* Write both rows back, then step block past them. */
262         MMI_SDC1(%[ftmp0], %[block], 0x00)
263         MMI_SDC1(%[ftmp4], %[block], 0x08)
264         MMI_SDC1(%[ftmp1], %[addr0], 0x00)
265         MMI_SDC1(%[ftmp5], %[addr0], 0x08)
266         PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"
267 
        /* Half-pass 2: the same sequence on the next two rows. */
268         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
269         MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
270         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
271         MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
272         MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
273         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
274         MMI_ULDC1(%[ftmp2], %[block], 0x00)
275         MMI_ULDC1(%[ftmp6], %[block], 0x08)
276         PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
277         MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
278         MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
279         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
280         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
281         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
282         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
283         MMI_SDC1(%[ftmp0], %[block], 0x00)
284         MMI_SDC1(%[ftmp4], %[block], 0x08)
285         MMI_SDC1(%[ftmp1], %[addr0], 0x00)
286         MMI_SDC1(%[ftmp5], %[addr0], 0x08)
287         PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"
288 
        /* Keep looping while rows remain. */
289         "bnez       %[h],       1b                                      \n\t"
290         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
291           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
292           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
293           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
294           RESTRICT_ASM_ALL64
295           [addr0]"=&r"(addr[0]),
296           [block]"+&r"(block),              [pixels]"+&r"(pixels),
297           [h]"+&r"(h)
298         : [line_size]"r"((mips_reg)line_size)
        /* The asm reads and writes through block. */
299         : "memory"
300     );
301 }
302 
/*
 * Write to dst the pavgb (packed per-byte average) of two sources, one word
 * per row (presumably 4 bytes, as the "pixels4" name implies).
 *
 * dst         - destination; stepped by dst_stride after each stored row
 * src1, src2  - the two sources; stepped by src_stride1 / src_stride2
 * dst_stride, src_stride1, src_stride2 - byte strides of the three buffers
 * h           - row count. Two rows are handled per loop pass and the loop
 *               exits only when h is exactly zero after the decrement, so h
 *               must be a positive multiple of 2.
 *
 * addr0 / addr1 hold the address of the second row of the current pair in
 * src1 / src2. Only addr[0] and addr[1] of the addr array are used.
 *
 * The operand list names RESTRICT_ASM_ADDRT, so the matching
 * DECLARE_VAR_ADDRT is declared here as every other function in this file
 * that lists that operand does; without it the operand would refer to an
 * undeclared variable on builds where the macro is not empty.
 */
ff_put_pixels4_l2_8_mmi(uint8_t * dst,const uint8_t * src1,const uint8_t * src2,int dst_stride,int src_stride1,int src_stride2,int h)303 inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
304     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
305     int h)
306 {
307     double ftmp[4];
308     mips_reg addr[5];
309     DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;
310 
311     __asm__ volatile (
        /* Loop head: each pass produces two rows. */
312         "1:                                                             \n\t"
        /* Rows 0 and 1 of src1 -> ftmp0, ftmp1. */
313         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
314         MMI_ULWC1(%[ftmp0], %[src1], 0x00)
315         MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        /* Rows 0 and 1 of src2 -> ftmp2, ftmp3. */
316         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
317         MMI_ULWC1(%[ftmp2], %[src2], 0x00)
318         MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        /* Step both sources past the pair (second row + one stride). */
319         PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
320         PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"
321 
        /* Account for the two rows of this pass. */
322         PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
323 
        /* Average the two sources and store, advancing dst after each row. */
324         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
325         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
326         MMI_SWC1(%[ftmp0], %[dst], 0x00)
327         PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"
328         MMI_SWC1(%[ftmp1], %[dst], 0x00)
329         PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"
330 
        /* Keep looping while rows remain. */
331         "bnez       %[h],       1b                                      \n\t"
332         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
333           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
334           RESTRICT_ASM_LOW32
335           RESTRICT_ASM_ADDRT
336           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
337           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
338           [src2]"+&r"(src2),                [h]"+&r"(h)
339         : [dst_stride]"r"((mips_reg)dst_stride),
340           [src_stride1]"r"((mips_reg)src_stride1),
341           [src_stride2]"r"((mips_reg)src_stride2)
        /* The asm writes through dst, so the compiler must not cache memory. */
342         : "memory"
343     );
344 }
345 
/*
 * Write to dst the pavgb (packed per-byte average) of two sources, one
 * doubleword per row (presumably 8 bytes).
 *
 * dst         - destination
 * src1, src2  - the two sources
 * dst_stride, src_stride1, src_stride2 - byte strides of the three buffers
 * h           - row count. Four rows (two pairs) are handled per loop pass
 *               and the loop exits only when h is exactly zero after the
 *               decrement, so h must be a positive multiple of 4.
 *
 * addr2, addr3 and addr4 are computed once before the loop as twice
 * src_stride1, src_stride2 and dst_stride; they step each pointer past a
 * row pair. addr0 / addr1 address the second row of the current pair in
 * src1 / src2. The second destination row is stored through
 * MMI_SDXC1(dst, dst_stride), a macro defined outside this file (by name an
 * indexed doubleword store - confirm against mmiutils.h).
 */
ff_put_pixels8_l2_8_mmi(uint8_t * dst,const uint8_t * src1,const uint8_t * src2,int dst_stride,int src_stride1,int src_stride2,int h)346 inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
347     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
348     int h)
349 {
350     double ftmp[4];
351     mips_reg addr[5];
352     DECLARE_VAR_ALL64;
353     DECLARE_VAR_ADDRT;
354 
355     __asm__ volatile (
        /* Loop-invariant double strides for the three buffers. */
356         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
357         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
358         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
359 
360         "1:                                                             \n\t"
        /* First row pair: two rows of src1, then two rows of src2. */
361         MMI_ULDC1(%[ftmp0], %[src1], 0x00)
362         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
363         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
364         MMI_ULDC1(%[ftmp2], %[src2], 0x00)
365         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
366         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        /* src1 is no longer needed for this pair; step it two rows. */
367         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        /* Average the sources and store rows 0 and 1 of the pair. */
368         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
369         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
370         MMI_SDC1(%[ftmp0], %[dst], 0x00)
371         MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        /* Step src2 and dst two rows. */
372         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
373         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
374 
        /* Second row pair: identical sequence on the next two rows. */
375         MMI_ULDC1(%[ftmp0], %[src1], 0x00)
376         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
377         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
378         MMI_ULDC1(%[ftmp2], %[src2], 0x00)
379         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
380         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
381         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
382         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
383         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
384         MMI_SDC1(%[ftmp0], %[dst], 0x00)
385         MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
386         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
387         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
388 
        /* Four rows done; loop while rows remain. */
389         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
390         "bnez       %[h],       1b                                      \n\t"
391         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
392           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
393           RESTRICT_ASM_ALL64
394           RESTRICT_ASM_ADDRT
395           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
396           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
397           [addr4]"=&r"(addr[4]),
398           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
399           [src2]"+&r"(src2),                [h]"+&r"(h)
400         : [dst_stride]"r"((mips_reg)dst_stride),
401           [src_stride1]"r"((mips_reg)src_stride1),
402           [src_stride2]"r"((mips_reg)src_stride2)
        /* The asm writes through dst, so the compiler must not cache memory. */
403         : "memory"
404     );
405 }
406 
/*
 * Write to dst the pavgb (packed per-byte average) of two sources, two
 * doublewords per row (offsets 0x00 and 0x08, presumably 16 bytes).
 *
 * dst         - destination
 * src1, src2  - the two sources
 * dst_stride, src_stride1, src_stride2 - byte strides of the three buffers
 * h           - row count. Four rows (two pairs) are handled per loop pass
 *               and the loop exits only when h is exactly zero after the
 *               decrement, so h must be a positive multiple of 4.
 *
 * addr2, addr3 and addr4 are computed once before the loop as twice
 * src_stride1, src_stride2 and dst_stride. addr0 / addr1 address the second
 * row of the current pair in src1 / src2. Per pair, ftmp0/ftmp4 and
 * ftmp1/ftmp5 hold the low/high halves of the two src1 rows, and
 * ftmp2/ftmp6 and ftmp3/ftmp7 the matching src2 rows.
 */
ff_put_pixels16_l2_8_mmi(uint8_t * dst,const uint8_t * src1,const uint8_t * src2,int dst_stride,int src_stride1,int src_stride2,int h)407 inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
408     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
409     int h)
410 {
411     double ftmp[8];
412     mips_reg addr[5];
413     DECLARE_VAR_ALL64;
414     DECLARE_VAR_ADDRT;
415 
416     __asm__ volatile (
        /* Loop-invariant double strides for the three buffers. */
417         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
418         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
419         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
420 
421         "1:                                                             \n\t"
        /* First row pair: both halves of two src1 rows ... */
422         MMI_ULDC1(%[ftmp0], %[src1], 0x00)
423         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
424         MMI_ULDC1(%[ftmp4], %[src1], 0x08)
425         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
426         MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        /* ... and both halves of two src2 rows; src1 is stepped in between. */
427         MMI_ULDC1(%[ftmp2], %[src2], 0x00)
428         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
429         MMI_ULDC1(%[ftmp6], %[src2], 0x08)
430         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
431         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
432         MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        /* Average src1 with src2, low and high halves separately. */
433         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
434         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
435         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
436         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        /* Store row 0 at dst and row 1 at dst + dst_stride, both halves. */
437         MMI_SDC1(%[ftmp0], %[dst], 0x00)
438         MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
439         MMI_SDC1(%[ftmp4], %[dst], 0x08)
440         MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        /* Step src2 and dst two rows. */
441         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
442         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
443 
        /* Second row pair: identical sequence on the next two rows. */
444         MMI_ULDC1(%[ftmp0], %[src1], 0x00)
445         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
446         MMI_ULDC1(%[ftmp4], %[src1], 0x08)
447         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
448         MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
449         MMI_ULDC1(%[ftmp2], %[src2], 0x00)
450         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
451         MMI_ULDC1(%[ftmp6], %[src2], 0x08)
452         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
453         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
454         MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
455         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
456         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
457         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
458         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
459         MMI_SDC1(%[ftmp0], %[dst], 0x00)
460         MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
461         MMI_SDC1(%[ftmp4], %[dst], 0x08)
462         MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
463         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
464         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
465 
        /* Four rows done; loop while rows remain. */
466         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
467         "bnez       %[h],       1b                                      \n\t"
468         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
469           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
470           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
471           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
472           RESTRICT_ASM_ALL64
473           RESTRICT_ASM_ADDRT
474           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
475           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
476           [addr4]"=&r"(addr[4]),
477           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
478           [src2]"+&r"(src2),                [h]"+&r"(h)
479         : [dst_stride]"r"((mips_reg)dst_stride),
480           [src_stride1]"r"((mips_reg)src_stride1),
481           [src_stride2]"r"((mips_reg)src_stride2)
        /* The asm writes through dst, so the compiler must not cache memory. */
482         : "memory"
483     );
484 }
485 
/*
 * Blend the average of two sources into dst: for every row the code first
 * takes pavgb(src1, src2) and then pavgb of that result with the word
 * already stored in dst, and writes it back. One word per row (presumably
 * 4 bytes).
 *
 * dst         - destination, read and then overwritten
 * src1, src2  - the two sources
 * dst_stride, src_stride1, src_stride2 - byte strides of the three buffers
 * h           - row count. Two rows are handled per loop pass and the loop
 *               exits only when h is exactly zero after the decrement, so h
 *               must be a positive multiple of 2.
 *
 * addr0 / addr1 / addr2 hold the address of the second row of the current
 * pair in src1 / src2 / dst. Only addr[0..2] of the addr array are used.
 */
ff_avg_pixels4_l2_8_mmi(uint8_t * dst,const uint8_t * src1,const uint8_t * src2,int dst_stride,int src_stride1,int src_stride2,int h)486 inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
487     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
488     int h)
489 {
490     double ftmp[6];
491     mips_reg addr[6];
492     DECLARE_VAR_LOW32;
493 
494     __asm__ volatile (
        /* Loop head: each pass produces two rows. */
495         "1:                                                             \n\t"
        /* Rows 0 and 1 of src1 -> ftmp0, ftmp1. */
496         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
497         MMI_ULWC1(%[ftmp0], %[src1], 0x00)
498         MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        /* Rows 0 and 1 of src2 -> ftmp2, ftmp3. */
499         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
500         MMI_ULWC1(%[ftmp2], %[src2], 0x00)
501         MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        /* Step both sources past the pair (second row + one stride). */
502         PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
503         PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"
        /* Stage 1: average the two sources. */
504         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
505         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        /* Current destination rows 0 and 1 -> ftmp4, ftmp5. */
506         PTR_ADDU   "%[addr2],   %[dst],         %[dst_stride]           \n\t"
507         MMI_ULWC1(%[ftmp4], %[dst], 0x00)
508         MMI_ULWC1(%[ftmp5], %[addr2], 0x00)
        /* Account for the two rows of this pass. */
509         PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
        /* Stage 2: average with the existing destination and write back. */
510         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
511         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
512         MMI_SWC1(%[ftmp0], %[dst], 0x00)
513         MMI_SWC1(%[ftmp1], %[addr2], 0x00)
        /* Step dst past the pair. */
514         PTR_ADDU   "%[dst],     %[addr2],       %[dst_stride]           \n\t"
515 
        /* Keep looping while rows remain. */
516         "bnez       %[h],       1b                                      \n\t"
517         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
518           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
519           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
520           RESTRICT_ASM_LOW32
521           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
522           [addr2]"=&r"(addr[2]),
523           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
524           [src2]"+&r"(src2),                [h]"+&r"(h)
525         : [dst_stride]"r"((mips_reg)dst_stride),
526           [src_stride1]"r"((mips_reg)src_stride1),
527           [src_stride2]"r"((mips_reg)src_stride2)
        /* The asm reads and writes through dst. */
528         : "memory"
529     );
530 }
531 
/*
 * Blend the average of two sources into dst: for every row the code first
 * takes pavgb(src1, src2) and then pavgb of that result with the
 * doubleword already stored in dst, and writes it back. One doubleword per
 * row (presumably 8 bytes).
 *
 * dst         - destination, read and then overwritten
 * src1, src2  - the two sources
 * dst_stride, src_stride1, src_stride2 - byte strides of the three buffers
 * h           - row count. Four rows (two pairs) are handled per loop pass
 *               and the loop exits only when h is exactly zero after the
 *               decrement, so h must be a positive multiple of 4.
 *
 * addr2, addr3 and addr4 are computed once before the loop as twice
 * src_stride1, src_stride2 and dst_stride. addr0 / addr1 / addr5 address
 * the second row of the current pair in src1 / src2 / dst; the second
 * destination row is loaded through addr5 but stored through
 * MMI_SDXC1(dst, dst_stride), which targets the same address.
 */
ff_avg_pixels8_l2_8_mmi(uint8_t * dst,const uint8_t * src1,const uint8_t * src2,int dst_stride,int src_stride1,int src_stride2,int h)532 inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
533     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
534     int h)
535 {
536     double ftmp[6];
537     mips_reg addr[6];
538     DECLARE_VAR_ALL64;
539     DECLARE_VAR_ADDRT;
540 
541     __asm__ volatile (
        /* Loop-invariant double strides for the three buffers. */
542         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
543         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
544         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
545 
546         "1:                                                             \n\t"
        /* First row pair: two rows of src1 and two rows of src2. */
547         MMI_ULDC1(%[ftmp0], %[src1], 0x00)
548         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
549         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
550         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
551         MMI_ULDC1(%[ftmp2], %[src2], 0x00)
552         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        /* src1 is no longer needed for this pair; step it two rows. */
553         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        /* Stage 1: average the two sources. */
554         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
555         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        /* Current destination rows 0 and 1 -> ftmp4, ftmp5. */
556         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
557         MMI_ULDC1(%[ftmp4], %[dst], 0x00)
558         MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        /* Stage 2: average with the existing destination and write back. */
559         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
560         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
561         MMI_SDC1(%[ftmp0], %[dst], 0x00)
562         MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        /* Step src2 and dst two rows. */
563         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
564         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
565 
        /* Second row pair: identical sequence on the next two rows. */
566         MMI_ULDC1(%[ftmp0], %[src1], 0x00)
567         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
568         MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
569         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
570         MMI_ULDC1(%[ftmp2], %[src2], 0x00)
571         MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
572         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
573         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
574         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
575         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
576         MMI_ULDC1(%[ftmp4], %[dst], 0x00)
577         MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
578         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
579         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
580         MMI_SDC1(%[ftmp0], %[dst], 0x00)
581         MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
582         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
583         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
584 
        /* Four rows done; loop while rows remain. */
585         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
586         "bnez       %[h],       1b                                      \n\t"
587         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
588           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
589           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
590           RESTRICT_ASM_ALL64
591           RESTRICT_ASM_ADDRT
592           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
593           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
594           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
595           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
596           [src2]"+&r"(src2),                [h]"+&r"(h)
597         : [dst_stride]"r"((mips_reg)dst_stride),
598           [src_stride1]"r"((mips_reg)src_stride1),
599           [src_stride2]"r"((mips_reg)src_stride2)
        /* The asm reads and writes through dst. */
600         : "memory"
601     );
602 }
603 
/*
 * 16-wide variant of ff_avg_pixels8_l2_8_mmi, built from two calls to the
 * 8-wide routine: one at byte offset 0 and one at byte offset 8 of dst,
 * src1 and src2. Strides and h are passed through unchanged, so the
 * constraints on h are those of ff_avg_pixels8_l2_8_mmi (its loop consumes
 * four rows per pass).
 */
ff_avg_pixels16_l2_8_mmi(uint8_t * dst,const uint8_t * src1,const uint8_t * src2,int dst_stride,int src_stride1,int src_stride2,int h)604 inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
605     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
606     int h)
607 {
    /* Left 8 columns. */
608     ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1,
609             src_stride2, h);
    /* Right 8 columns. */
610     ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, dst_stride,
611             src_stride1, src_stride2, h);
612 }
613 
/*
 * Forwards to ff_put_pixels4_l2_8_mmi() with pixels and its right-hand
 * neighbour (pixels + 1) as the two sources. line_size is used as the
 * stride of the destination and of both sources; h is passed through.
 */
void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_pixels4_l2_8_mmi(block, pixels, right,
            line_size, line_size, line_size, h);
}
620 
/*
 * Forwards to ff_put_pixels8_l2_8_mmi() with pixels and its right-hand
 * neighbour (pixels + 1) as the two sources. line_size is used as the
 * stride of the destination and of both sources; h is passed through.
 */
void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_pixels8_l2_8_mmi(block, pixels, right,
            line_size, line_size, line_size, h);
}
627 
/*
 * Forwards to ff_put_pixels16_l2_8_mmi() with pixels and its right-hand
 * neighbour (pixels + 1) as the two sources. line_size is used as the
 * stride of the destination and of both sources; h is passed through.
 */
void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_pixels16_l2_8_mmi(block, pixels, right,
            line_size, line_size, line_size, h);
}
634 
/*
 * Forwards to ff_avg_pixels4_l2_8_mmi() with pixels and its right-hand
 * neighbour (pixels + 1) as the two sources. line_size is used as the
 * stride of the destination and of both sources; h is passed through.
 */
void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_avg_pixels4_l2_8_mmi(block, pixels, right,
            line_size, line_size, line_size, h);
}
641 
/*
 * Forwards to ff_avg_pixels8_l2_8_mmi() with pixels and its right-hand
 * neighbour (pixels + 1) as the two sources. line_size is used as the
 * stride of the destination and of both sources; h is passed through.
 */
void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_avg_pixels8_l2_8_mmi(block, pixels, right,
            line_size, line_size, line_size, h);
}
648 
/*
 * 16-byte-wide version of ff_avg_pixels8_x2_8_mmi(): the 8-wide routine is
 * applied to the left half (offset 0) and then to the right half
 * (offset 8) of both block and pixels.
 */
void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        ff_avg_pixels8_x2_8_mmi(block + half, pixels + half, line_size, h);
}
655 
/*
 * Writes to dst, 8 bytes per row, a byte-wise combination of the matching
 * rows of src1 and src2. Each source is complemented, the two are fed to
 * pavgb, and the result is complemented again before the store.
 * NOTE(review): pavgb presumably rounds halves up (like its MMX
 * counterpart), so the complement trick yields the round-down average
 * (a + b) >> 1 -- confirm against the Loongson MMI manual.
 *
 * dst         - destination, advanced by dst_stride per row
 * src1, src2  - the two sources, advanced by src_stride1 / src_stride2
 * h           - row count. One loop iteration handles four rows and the loop
 *               exits only when h reaches exactly 0, so h is assumed to be a
 *               positive multiple of 4.
 */
inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[5];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        /* ftmp4 = all ones: xor with it is a bitwise NOT */
        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
        /* addr2/addr3/addr4 = twice the respective stride (two-row step) */
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        /* rows 0 and 1: ftmp0/ftmp1 from src1, ftmp2/ftmp3 from src2 */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        /* complement all four inputs, average, complement the result */
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        /* store row 0 at dst and row 1 via the dst/dst_stride indexed store */
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* rows 2 and 3: same sequence as above (loop unrolled once) */
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        /* four rows done; repeat until h is exactly 0 */
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}
730 
/*
 * Forwards to ff_put_no_rnd_pixels8_l2_8_mmi() with pixels and its
 * right-hand neighbour (pixels + 1) as the two sources. line_size is used
 * for all three strides (it is converted to the callee's int stride
 * parameters); h is passed through.
 */
void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, right,
            line_size, line_size, line_size, h);
}
737 
/*
 * 16-byte-wide version of ff_put_no_rnd_pixels8_x2_8_mmi(): the 8-wide
 * routine is applied to the left half (offset 0) and then to the right
 * half (offset 8) of both block and pixels.
 */
void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        ff_put_no_rnd_pixels8_x2_8_mmi(block + half, pixels + half,
                line_size, h);
}
744 
/*
 * Forwards to ff_put_pixels4_l2_8_mmi() with pixels and the row below it
 * (pixels + line_size) as the two sources. line_size is used as the stride
 * of the destination and of both sources; h is passed through.
 */
void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_pixels4_l2_8_mmi(block, pixels, below,
            line_size, line_size, line_size, h);
}
751 
/*
 * Forwards to ff_put_pixels8_l2_8_mmi() with pixels and the row below it
 * (pixels + line_size) as the two sources. line_size is used as the stride
 * of the destination and of both sources; h is passed through.
 */
void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_pixels8_l2_8_mmi(block, pixels, below,
            line_size, line_size, line_size, h);
}
758 
/*
 * Forwards to ff_put_pixels16_l2_8_mmi() with pixels and the row below it
 * (pixels + line_size) as the two sources. line_size is used as the stride
 * of the destination and of both sources; h is passed through.
 */
void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_pixels16_l2_8_mmi(block, pixels, below,
            line_size, line_size, line_size, h);
}
765 
/*
 * Forwards to ff_avg_pixels4_l2_8_mmi() with pixels and the row below it
 * (pixels + line_size) as the two sources. line_size is used as the stride
 * of the destination and of both sources; h is passed through.
 */
void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_avg_pixels4_l2_8_mmi(block, pixels, below,
            line_size, line_size, line_size, h);
}
772 
/*
 * Forwards to ff_avg_pixels8_l2_8_mmi() with pixels and the row below it
 * (pixels + line_size) as the two sources. line_size is used as the stride
 * of the destination and of both sources; h is passed through.
 */
void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_avg_pixels8_l2_8_mmi(block, pixels, below,
            line_size, line_size, line_size, h);
}
779 
/*
 * 16-byte-wide version of ff_avg_pixels8_y2_8_mmi(): the 8-wide routine is
 * applied to the left half (offset 0) and then to the right half
 * (offset 8) of both block and pixels.
 */
void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        ff_avg_pixels8_y2_8_mmi(block + half, pixels + half, line_size, h);
}
786 
/*
 * Forwards to ff_put_no_rnd_pixels8_l2_8_mmi() with pixels and the row
 * below it (pixels + line_size) as the two sources. line_size is used for
 * all three strides (it is converted to the callee's int stride
 * parameters); h is passed through.
 */
void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, below,
            line_size, line_size, line_size, h);
}
793 
/*
 * 16-byte-wide version of ff_put_no_rnd_pixels8_y2_8_mmi(): the 8-wide
 * routine is applied to the left half (offset 0) and then to the right
 * half (offset 8) of both block and pixels.
 */
void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        ff_put_no_rnd_pixels8_y2_8_mmi(block + half, pixels + half,
                line_size, h);
}
800 
/*
 * 2x2 box filter over a 4-byte-wide column, four bytes at a time in one
 * 32-bit word. Every output byte is (A + B + C + D + 2) >> 2, where A and B
 * are horizontally adjacent bytes on one source row and C and D the bytes
 * directly below them.
 *
 * To avoid carries between bytes, each byte is split into its low two bits
 * and its upper six bits (pre-shifted right by 2). The two parts are summed
 * separately and recombined at the store. The horizontal sums of the
 * previous row are kept, so each source row is read once.
 *
 * Two output rows are written per loop iteration (row < h, step 2).
 */
void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t lo_mask   = 0x03030303UL;  /* low 2 bits of each byte   */
    const uint32_t hi_mask   = 0xFCFCFCFCUL;  /* high 6 bits of each byte  */
    const uint32_t round_two = 0x02020202UL;  /* +2 per byte before >> 2   */
    const uint32_t keep_low4 = 0x0F0F0F0FUL;  /* drop bits shifted across  */
    uint32_t left, right;
    uint32_t lo_even, hi_even;                /* row sums incl. rounding   */
    uint32_t lo_odd, hi_odd;                  /* row sums without rounding */
    int row;

    left    = AV_RN32(pixels);
    right   = AV_RN32(pixels + 1);
    lo_even = (left & lo_mask) + (right & lo_mask) + round_two;
    hi_even = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
    pixels += line_size;

    for (row = 0; row < h; row += 2) {
        /* first output row of the pair: kept sums + freshly read row */
        left   = AV_RN32(pixels);
        right  = AV_RN32(pixels + 1);
        lo_odd = (left & lo_mask) + (right & lo_mask);
        hi_odd = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
        *((uint32_t *) block) =
            hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & keep_low4);
        pixels += line_size;
        block  += line_size;

        /* second output row: the row just read + the next one */
        left    = AV_RN32(pixels);
        right   = AV_RN32(pixels + 1);
        lo_even = (left & lo_mask) + (right & lo_mask) + round_two;
        hi_even = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
        *((uint32_t *) block) =
            hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & keep_low4);
        pixels += line_size;
        block  += line_size;
    }
}
838 
ff_put_pixels8_xy2_8_mmi(uint8_t * block,const uint8_t * pixels,ptrdiff_t line_size,int h)839 void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
840     ptrdiff_t line_size, int h)
841 {
842 #if 1
843     double ftmp[10];
844     mips_reg addr[2];
845     DECLARE_VAR_ALL64;
846     DECLARE_VAR_ADDRT;
847 
848     __asm__ volatile (
849         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
850         "dli        %[addr0],   0x0f                                    \n\t"
851         "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
852         "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
853         "dli        %[addr0],   0x01                                    \n\t"
854         "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
855         "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
856         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
857 
858         "dli        %[addr0],   0x02                                    \n\t"
859         "dmtc1      %[addr0],   %[ftmp9]                                \n\t"
860         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
861         MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
862         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
863         "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
864         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
865         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
866         "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
867         "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
868         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
869         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
870         "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
871         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
872         ".p2align   3                                                   \n\t"
873 
874         "1:                                                             \n\t"
875         PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
876         MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
877         MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
878         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
879         "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
880         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
881         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
882         "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
883         "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
884         "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
885         "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
886         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
887         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
888         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
889         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
890         "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
891         "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
892         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
893         MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
894         PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
895         PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
896         MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
897         MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
898         "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
899         "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
900         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
901         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
902         "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
903         "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
904         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
905         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
906         "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
907         "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
908         "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
909         "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
910         "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
911         "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
912         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
913         MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
914         PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
915         PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t"
916         "bnez       %[h],       1b                                      \n\t"
917         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
918           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
919           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
920           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
921           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
922           RESTRICT_ASM_ALL64
923           RESTRICT_ASM_ADDRT
924           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
925           [h]"+&r"(h),                      [pixels]"+&r"(pixels)
926         : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
927         : "memory"
928     );
929 #else
930     /* FIXME HIGH BIT DEPTH */
931     int j;
932 
933     for (j = 0; j < 2; j++) {
934         int i;
935         const uint32_t a = AV_RN32(pixels);
936         const uint32_t b = AV_RN32(pixels + 1);
937         uint32_t l0 = (a & 0x03030303UL) +
938                       (b & 0x03030303UL) +
939                            0x02020202UL;
940         uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
941                       ((b & 0xFCFCFCFCUL) >> 2);
942         uint32_t l1, h1;
943 
944         pixels += line_size;
945         for (i = 0; i < h; i += 2) {
946             uint32_t a = AV_RN32(pixels);
947             uint32_t b = AV_RN32(pixels + 1);
948             l1 = (a & 0x03030303UL) +
949                  (b & 0x03030303UL);
950             h1 = ((a & 0xFCFCFCFCUL) >> 2) +
951                  ((b & 0xFCFCFCFCUL) >> 2);
952             *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
953             pixels += line_size;
954             block  += line_size;
955             a  = AV_RN32(pixels);
956             b  = AV_RN32(pixels + 1);
957             l0 = (a & 0x03030303UL) +
958                  (b & 0x03030303UL) +
959                       0x02020202UL;
960             h0 = ((a & 0xFCFCFCFCUL) >> 2) +
961                  ((b & 0xFCFCFCFCUL) >> 2);
962             *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
963             pixels += line_size;
964             block  += line_size;
965         }
966         pixels += 4 - line_size * (h + 1);
967         block  += 4 - line_size * h;
968     }
969 #endif
970 }
971 
/*
 * 16-byte-wide version of ff_put_pixels8_xy2_8_mmi(): the 8-wide routine
 * is applied to the left half (offset 0) and then to the right half
 * (offset 8) of both block and pixels.
 */
void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        ff_put_pixels8_xy2_8_mmi(block + half, pixels + half, line_size, h);
}
978 
/*
 * 2x2 box filter over a 4-byte-wide column, merged into the destination.
 * For every output byte the value (A + B + C + D + 2) >> 2 is computed
 * (A, B horizontally adjacent on one source row, C, D directly below them).
 * The resulting 32-bit word is combined with the word already stored in
 * block through rnd_avg32(), and the result is written back.
 *
 * Bytes are split into their low two bits and their upper six bits
 * (pre-shifted by 2) so four bytes can be summed in one 32-bit word without
 * carries between bytes. Two output rows are written per loop iteration
 * (row < h, step 2).
 */
void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t lo_mask   = 0x03030303UL;  /* low 2 bits of each byte   */
    const uint32_t hi_mask   = 0xFCFCFCFCUL;  /* high 6 bits of each byte  */
    const uint32_t round_two = 0x02020202UL;  /* +2 per byte before >> 2   */
    const uint32_t keep_low4 = 0x0F0F0F0FUL;  /* drop bits shifted across  */
    uint32_t left, right, filtered;
    uint32_t lo_even, hi_even;                /* row sums incl. rounding   */
    uint32_t lo_odd, hi_odd;                  /* row sums without rounding */
    int row;

    left    = AV_RN32(pixels);
    right   = AV_RN32(pixels + 1);
    lo_even = (left & lo_mask) + (right & lo_mask) + round_two;
    hi_even = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
    pixels += line_size;

    for (row = 0; row < h; row += 2) {
        /* first output row of the pair */
        left     = AV_RN32(pixels);
        right    = AV_RN32(pixels + 1);
        lo_odd   = (left & lo_mask) + (right & lo_mask);
        hi_odd   = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
        filtered = hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & keep_low4);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), filtered);
        pixels += line_size;
        block  += line_size;

        /* second output row of the pair */
        left     = AV_RN32(pixels);
        right    = AV_RN32(pixels + 1);
        lo_even  = (left & lo_mask) + (right & lo_mask) + round_two;
        hi_even  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
        filtered = hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & keep_low4);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), filtered);
        pixels += line_size;
        block  += line_size;
    }
}
1016 
/*
 * 2x2 box filter over an 8-byte-wide column, merged into the destination.
 * The column is handled as two 4-byte-wide strips. For every output byte
 * the value (A + B + C + D + 2) >> 2 is computed (A, B horizontally
 * adjacent on one source row, C, D directly below them), and the resulting
 * 32-bit word is combined with the word already in block via rnd_avg32().
 *
 * After a strip is finished both pointers are rewound by the formulas at
 * the bottom of the outer loop and moved 4 bytes to the right.
 */
void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t lo_mask   = 0x03030303UL;  /* low 2 bits of each byte   */
    const uint32_t hi_mask   = 0xFCFCFCFCUL;  /* high 6 bits of each byte  */
    const uint32_t round_two = 0x02020202UL;  /* +2 per byte before >> 2   */
    const uint32_t keep_low4 = 0x0F0F0F0FUL;  /* drop bits shifted across  */
    uint32_t left, right, filtered;
    uint32_t lo_even, hi_even;                /* row sums incl. rounding   */
    uint32_t lo_odd, hi_odd;                  /* row sums without rounding */
    int strip, row;

    for (strip = 0; strip < 2; strip++) {
        left    = AV_RN32(pixels);
        right   = AV_RN32(pixels + 1);
        lo_even = (left & lo_mask) + (right & lo_mask) + round_two;
        hi_even = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
        pixels += line_size;

        for (row = 0; row < h; row += 2) {
            /* first output row of the pair */
            left     = AV_RN32(pixels);
            right    = AV_RN32(pixels + 1);
            lo_odd   = (left & lo_mask) + (right & lo_mask);
            hi_odd   = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
            filtered = hi_even + hi_odd +
                       (((lo_even + lo_odd) >> 2) & keep_low4);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), filtered);
            pixels += line_size;
            block  += line_size;

            /* second output row of the pair */
            left     = AV_RN32(pixels);
            right    = AV_RN32(pixels + 1);
            lo_even  = (left & lo_mask) + (right & lo_mask) + round_two;
            hi_even  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
            filtered = hi_even + hi_odd +
                       (((lo_even + lo_odd) >> 2) & keep_low4);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), filtered);
            pixels += line_size;
            block  += line_size;
        }

        /* back to the top row, one strip (4 bytes) further right */
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}
1060 
/*
 * 16-byte-wide version of ff_avg_pixels8_xy2_8_mmi(): the 8-wide routine
 * is applied to the left half (offset 0) and then to the right half
 * (offset 8) of both block and pixels.
 */
void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        ff_avg_pixels8_xy2_8_mmi(block + half, pixels + half, line_size, h);
}
1067 
/*
 * 2x2 box filter over an 8-byte-wide column with the smaller rounding
 * term: every output byte is (A + B + C + D + 1) >> 2, where A and B are
 * horizontally adjacent bytes on one source row and C and D the bytes
 * directly below them. The column is handled as two 4-byte-wide strips.
 *
 * Bytes are split into their low two bits and their upper six bits
 * (pre-shifted by 2) so four bytes can be summed in one 32-bit word without
 * carries between bytes. After a strip is finished both pointers are
 * rewound by the formulas at the bottom of the outer loop and moved 4 bytes
 * to the right.
 */
void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t lo_mask   = 0x03030303UL;  /* low 2 bits of each byte   */
    const uint32_t hi_mask   = 0xFCFCFCFCUL;  /* high 6 bits of each byte  */
    const uint32_t round_one = 0x01010101UL;  /* +1 per byte before >> 2   */
    const uint32_t keep_low4 = 0x0F0F0F0FUL;  /* drop bits shifted across  */
    uint32_t left, right;
    uint32_t lo_even, hi_even;                /* row sums incl. rounding   */
    uint32_t lo_odd, hi_odd;                  /* row sums without rounding */
    int strip, row;

    for (strip = 0; strip < 2; strip++) {
        left    = AV_RN32(pixels);
        right   = AV_RN32(pixels + 1);
        lo_even = (left & lo_mask) + (right & lo_mask) + round_one;
        hi_even = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
        pixels += line_size;

        for (row = 0; row < h; row += 2) {
            /* first output row of the pair */
            left   = AV_RN32(pixels);
            right  = AV_RN32(pixels + 1);
            lo_odd = (left & lo_mask) + (right & lo_mask);
            hi_odd = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
            *((uint32_t *) block) =
                hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & keep_low4);
            pixels += line_size;
            block  += line_size;

            /* second output row of the pair */
            left    = AV_RN32(pixels);
            right   = AV_RN32(pixels + 1);
            lo_even = (left & lo_mask) + (right & lo_mask) + round_one;
            hi_even = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
            *((uint32_t *) block) =
                hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & keep_low4);
            pixels += line_size;
            block  += line_size;
        }

        /* back to the top row, one strip (4 bytes) further right */
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}
1111 
/*
 * 16-byte-wide version of ff_put_no_rnd_pixels8_xy2_8_mmi(): the 8-wide
 * routine is applied to the left half (offset 0) and then to the right
 * half (offset 8) of both block and pixels.
 */
void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        ff_put_no_rnd_pixels8_xy2_8_mmi(block + half, pixels + half,
                line_size, h);
}
1118