/*
 * Loongson SIMD optimized hpeldsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "hpeldsp_mips.h"
#include "libavcodec/bit_depth_template.c"
#include "libavutil/mips/mmiutils.h"
#include "constants.h"

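/*
 * Notes on the helpers used below: the MMI_ULWC1/MMI_ULDC1 macros (from
 * libavutil/mips/mmiutils.h) perform unaligned 32-/64-bit loads into an MMI
 * (floating point) register, MMI_SWC1/MMI_SDC1 are the matching stores and
 * MMI_SDXC1 a store with a register index added to the base; PTR_ADDU and
 * PTR_ADDI are pointer-width add / add-immediate so the same code builds for
 * 32- and 64-bit ABIs. All routines work on 8-bit pixels and, as the loop
 * structure implies, expect h to be a multiple of the rows handled per
 * iteration (2 or 4 depending on the function).
 */
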
void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[4];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                             \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
        MMI_ULWC1(%[ftmp1], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"

        MMI_SWC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
        MMI_SWC1(%[ftmp1], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

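/* 8-pixel-wide copy: same idea as the 4-wide version above, but with
 * unaligned 64-bit loads/stores and four rows handled per loop iteration. */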
void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[4];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                             \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp2], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp3], %[pixels], 0x00)
        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
        MMI_SDC1(%[ftmp2], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
        MMI_SDC1(%[ftmp3], %[block], 0x00)
        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

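/* 16-pixel-wide copy: two 64-bit loads/stores per row, four rows per
 * iteration. */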
void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                            \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp4], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp6], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
        MMI_ULDC1(%[ftmp5], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp7], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04                  \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp2], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
        MMI_SDC1(%[ftmp1], %[block], 0x00)
        MMI_SDC1(%[ftmp3], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
        MMI_SDC1(%[ftmp4], %[block], 0x00)
        MMI_SDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
        MMI_SDC1(%[ftmp5], %[block], 0x00)
        MMI_SDC1(%[ftmp7], %[block], 0x08)
        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"

        "bnez       %[h],       1b                                     \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

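/* The avg_* variants blend the source into the destination with pavgb, the
 * per-byte rounding average: dst = (dst + src + 1) >> 1. */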
void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                             \n\t"
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
        MMI_ULWC1(%[ftmp2], %[block], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)

        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"

        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SWC1(%[ftmp0], %[block], 0x00)
        MMI_SWC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU   "%[pixels],  %[addr0],       %[line_size]            \n\t"
        PTR_ADDU   "%[block],   %[addr1],       %[line_size]            \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[4];
    mips_reg addr[3];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
        "1:                                                             \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"

        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    double ftmp[8];
    mips_reg addr[1];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                             \n\t"
        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"

        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp5], %[pixels], 0x08)
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp2], %[block], 0x00)
        MMI_ULDC1(%[ftmp6], %[block], 0x08)
        PTR_ADDU   "%[addr0],   %[block],       %[line_size]            \n\t"
        MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp7], %[addr0], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp4], %[block], 0x08)
        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
        MMI_SDC1(%[ftmp5], %[addr0], 0x08)
        PTR_ADDU   "%[block],   %[addr0],       %[line_size]            \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),
          [block]"+&r"(block),              [pixels]"+&r"(pixels),
          [h]"+&r"(h)
        : [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
}

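/* The *_l2 helpers write the rounding average of two source blocks read with
 * independent strides; the x2/y2 half-pel wrappers further down are built on
 * top of them by passing pixels + 1 or pixels + line_size as the second
 * source. */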
inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "1:                                                             \n\t"
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"

        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"
        MMI_SWC1(%[ftmp1], %[dst], 0x00)
        PTR_ADDU   "%[dst],     %[dst],         %[dst_stride]           \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[4];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[8];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        MMI_SDC1(%[ftmp4], %[dst], 0x08)
        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

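/* avg_*_l2: first average the two sources, then average that result with the
 * data already in dst (both steps use the rounding pavgb average). */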
inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                             \n\t"
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[addr0],       %[src_stride1]          \n\t"
        PTR_ADDU   "%[src2],    %[addr1],       %[src_stride2]          \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        PTR_ADDU   "%[addr2],   %[dst],         %[dst_stride]           \n\t"
        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
        MMI_ULWC1(%[ftmp5], %[addr2], 0x00)
        PTR_ADDI   "%[h],       %[h],           -0x02                   \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        MMI_SWC1(%[ftmp1], %[addr2], 0x00)
        PTR_ADDU   "%[dst],     %[addr2],       %[dst_stride]           \n\t"

        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[6];
    mips_reg addr[6];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1,
            src_stride2, h);
    ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, dst_stride,
            src_stride1, src_stride2, h);
}

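/* x2 half-pel wrappers: horizontal interpolation is the rounding average of
 * each pixel with its right neighbour, i.e. an l2 call with src2 = pixels + 1. */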
void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
            line_size, h);
}

void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

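/* "no_rnd" variant: pavgb always rounds up, so to get the truncating average
 * (a + b) >> 1 the inputs are complemented, averaged, and the result
 * complemented again: ~pavgb(~a, ~b) == (a + b) >> 1. ftmp4 holds the
 * all-ones mask used for the complement. */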
inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    double ftmp[5];
    mips_reg addr[5];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"

        "1:                                                             \n\t"
        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        "pxor       %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
        "pxor       %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        "pxor       %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
        "pxor       %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "pxor       %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "pxor       %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"

        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
          [addr4]"=&r"(addr[4]),
          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
          [src2]"+&r"(src2),                [h]"+&r"(h)
        : [dst_stride]"r"((mips_reg)dst_stride),
          [src_stride1]"r"((mips_reg)src_stride1),
          [src_stride2]"r"((mips_reg)src_stride2)
        : "memory"
    );
}

void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size,
            line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_x2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
}

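/* y2 half-pel wrappers: vertical interpolation averages each row with the row
 * below it, i.e. an l2 call with src2 = pixels + line_size. */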
void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
            line_size, line_size, h);
}

void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + line_size,
            line_size, line_size, line_size, h);
}

void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_y2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
}

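/* xy2 (diagonal) half-pel: each output byte is (A + B + C + D + 2) >> 2 of the
 * surrounding 2x2 input pixels. The scalar code below handles four bytes at a
 * time with a SWAR trick: the low two bits of every byte are summed in l0/l1
 * (plus the per-byte rounding constant 0x02) while the high six bits are
 * pre-shifted and summed in h0/h1, so no per-byte sum can overflow into a
 * neighbouring byte. */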
void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                       0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
        a  = AV_RN32(pixels);
        b  = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
                  0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
        pixels += line_size;
        block  += line_size;
    }
}

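/* 8-wide xy2 in MMI: bytes are widened to 16-bit lanes (punpcklbh/punpckhbh
 * against the zero register ftmp7), the horizontal pair sums of the current
 * and previous row are accumulated together with a per-lane rounding constant
 * of 2 (built in ftmp6), shifted right by 2 and packed back to bytes with
 * saturation. A scalar equivalent is kept in the #else branch. */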
void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
#if 1
    double ftmp[10];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    __asm__ volatile (
        "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
        "dli        %[addr0],   0x0f                                    \n\t"
        "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
        "dli        %[addr0],   0x01                                    \n\t"
        "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"

        "dli        %[addr0],   0x02                                    \n\t"
        "dmtc1      %[addr0],   %[ftmp9]                                \n\t"
        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
        MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
        "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
        ".p2align   3                                                   \n\t"

        "1:                                                             \n\t"
        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
        "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
        "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
        MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
        MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
        "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
        MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
        PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t"
        "bnez       %[h],       1b                                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [h]"+&r"(h),                      [pixels]"+&r"(pixels)
        : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
        : "memory"
    );
#else
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                           0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a  = AV_RN32(pixels);
            b  = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                      0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
#endif
}

void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int i;
    const uint32_t a = AV_RN32(pixels);
    const uint32_t b = AV_RN32(pixels + 1);
    uint32_t l0 = (a & 0x03030303UL) +
                  (b & 0x03030303UL) +
                       0x02020202UL;
    uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                  ((b & 0xFCFCFCFCUL) >> 2);
    uint32_t l1, h1;

    pixels += line_size;
    for (i = 0; i < h; i += 2) {
        uint32_t a = AV_RN32(pixels);
        uint32_t b = AV_RN32(pixels + 1);
        l1 = (a & 0x03030303UL) +
             (b & 0x03030303UL);
        h1 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
        a  = AV_RN32(pixels);
        b  = AV_RN32(pixels + 1);
        l0 = (a & 0x03030303UL) +
             (b & 0x03030303UL) +
                  0x02020202UL;
        h0 = ((a & 0xFCFCFCFCUL) >> 2) +
             ((b & 0xFCFCFCFCUL) >> 2);
        *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
    }
}

void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                           0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
            a  = AV_RN32(pixels);
            b  = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                      0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_avg_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}

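/* no_rnd xy2: identical to the rounded version except the per-byte rounding
 * constant is 0x01 instead of 0x02, i.e. (A + B + C + D + 1) >> 2 rather than
 * (A + B + C + D + 2) >> 2. */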
void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    int j;

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = AV_RN32(pixels);
        const uint32_t b = AV_RN32(pixels + 1);
        uint32_t l0 = (a & 0x03030303UL) +
                      (b & 0x03030303UL) +
                           0x01010101UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                      ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = AV_RN32(pixels);
            uint32_t b = AV_RN32(pixels + 1);
            l1 = (a & 0x03030303UL) +
                 (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
            a  = AV_RN32(pixels);
            b  = AV_RN32(pixels + 1);
            l0 = (a & 0x03030303UL) +
                 (b & 0x03030303UL) +
                      0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
                 ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block  += 4 - line_size * h;
    }
}

void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    ff_put_no_rnd_pixels8_xy2_8_mmi(block, pixels, line_size, h);
    ff_put_no_rnd_pixels8_xy2_8_mmi(block + 8, pixels + 8, line_size, h);
}