/*
 * Loongson SIMD optimized h264chroma
 *
 * Copyright (c) 2015 Loongson Technology Corporation Limited
 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *                    Zhang Shuangshuang <zhangshuangshuang@ict.ac.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
24
25 #include "h264chroma_mips.h"
26 #include "constants.h"
27 #include "libavutil/mips/mmiutils.h"
28
ff_put_h264_chroma_mc8_mmi(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int h,int x,int y)29 void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
30 int h, int x, int y)
31 {
32 int A = 64, B, C, D, E;
33 double ftmp[12];
34 uint64_t tmp[1];
35
36 if (!(x || y)) {
37 /* x=0, y=0, A=64 */
38 __asm__ volatile (
39 "1: \n\t"
40 MMI_ULDC1(%[ftmp0], %[src], 0x00)
41 PTR_ADDU "%[src], %[src], %[stride] \n\t"
42 MMI_ULDC1(%[ftmp1], %[src], 0x00)
43 PTR_ADDU "%[src], %[src], %[stride] \n\t"
44 MMI_ULDC1(%[ftmp2], %[src], 0x00)
45 PTR_ADDU "%[src], %[src], %[stride] \n\t"
46 MMI_ULDC1(%[ftmp3], %[src], 0x00)
47 PTR_ADDU "%[src], %[src], %[stride] \n\t"
48
49 "addi %[h], %[h], -0x04 \n\t"
50
51 MMI_SDC1(%[ftmp0], %[dst], 0x00)
52 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
53 MMI_SDC1(%[ftmp1], %[dst], 0x00)
54 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
55 MMI_SDC1(%[ftmp2], %[dst], 0x00)
56 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
57 MMI_SDC1(%[ftmp3], %[dst], 0x00)
58 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
59 "bnez %[h], 1b \n\t"
60 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
61 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
62 [dst]"+&r"(dst), [src]"+&r"(src),
63 [h]"+&r"(h)
64 : [stride]"r"((mips_reg)stride)
65 : "memory"
66 );
67 } else if (x && y) {
68 /* x!=0, y!=0 */
69 D = x * y;
70 B = (x << 3) - D;
71 C = (y << 3) - D;
72 A = 64 - D - B - C;
73
74 __asm__ volatile (
75 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
76 "dli %[tmp0], 0x06 \n\t"
77 "pshufh %[A], %[A], %[ftmp0] \n\t"
78 "pshufh %[B], %[B], %[ftmp0] \n\t"
79 "mtc1 %[tmp0], %[ftmp9] \n\t"
80 "pshufh %[C], %[C], %[ftmp0] \n\t"
81 "pshufh %[D], %[D], %[ftmp0] \n\t"
82
83 "1: \n\t"
84 MMI_ULDC1(%[ftmp1], %[src], 0x00)
85 MMI_ULDC1(%[ftmp2], %[src], 0x01)
86 PTR_ADDU "%[src], %[src], %[stride] \n\t"
87 MMI_ULDC1(%[ftmp3], %[src], 0x00)
88 MMI_ULDC1(%[ftmp4], %[src], 0x01)
89 PTR_ADDU "%[src], %[src], %[stride] \n\t"
90 MMI_ULDC1(%[ftmp10], %[src], 0x00)
91 MMI_ULDC1(%[ftmp11], %[src], 0x01)
92 "addi %[h], %[h], -0x02 \n\t"
93
94 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
95 "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
96 "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
97 "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
98 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
99 "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
100 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
101 "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
102 "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
103 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
104 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
105 "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
106 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
107 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
108 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
109 "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
110 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
111 "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
112 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
113 "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
114 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
115 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
116 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
117 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
118 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
119 "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
120 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
121
122 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
123 "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
124 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
125 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
126 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
127 "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
128 "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
129 "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
130 "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
131 "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
132 "punpcklbh %[ftmp5], %[ftmp10], %[ftmp0] \n\t"
133 "punpckhbh %[ftmp6], %[ftmp10], %[ftmp0] \n\t"
134 "punpcklbh %[ftmp7], %[ftmp11], %[ftmp0] \n\t"
135 "punpckhbh %[ftmp8], %[ftmp11], %[ftmp0] \n\t"
136 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
137 "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
138 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
139 "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
140 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
141 "paddh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
142 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
143 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
144 "paddh %[ftmp3], %[ftmp3], %[ff_pw_32] \n\t"
145 "paddh %[ftmp4], %[ftmp4], %[ff_pw_32] \n\t"
146 "psrlh %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
147 "psrlh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
148 "packushb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
149
150 MMI_SDC1(%[ftmp1], %[dst], 0x00)
151 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
152 MMI_SDC1(%[ftmp3], %[dst], 0x00)
153 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
154 "bnez %[h], 1b \n\t"
155 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
156 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
157 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
158 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
159 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
160 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
161 [tmp0]"=&r"(tmp[0]),
162 [dst]"+&r"(dst), [src]"+&r"(src),
163 [h]"+&r"(h)
164 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
165 [A]"f"(A), [B]"f"(B),
166 [C]"f"(C), [D]"f"(D)
167 : "memory"
168 );
169 } else if (x) {
170 /* x!=0, y==0 */
171 E = x << 3;
172 A = 64 - E;
173
174 __asm__ volatile (
175 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
176 "dli %[tmp0], 0x06 \n\t"
177 "pshufh %[A], %[A], %[ftmp0] \n\t"
178 "pshufh %[E], %[E], %[ftmp0] \n\t"
179 "mtc1 %[tmp0], %[ftmp7] \n\t"
180
181 "1: \n\t"
182 MMI_ULDC1(%[ftmp1], %[src], 0x00)
183 MMI_ULDC1(%[ftmp2], %[src], 0x01)
184 "addi %[h], %[h], -0x01 \n\t"
185 PTR_ADDU "%[src], %[src], %[stride] \n\t"
186
187 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
188 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
189 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
190 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
191 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
192 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
193 "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
194 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
195 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
196 "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
197
198 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
199 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
200 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
201 "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
202 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
203 MMI_SDC1(%[ftmp1], %[dst], 0x00)
204 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
205 "bnez %[h], 1b \n\t"
206 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
207 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
208 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
209 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
210 [tmp0]"=&r"(tmp[0]),
211 [dst]"+&r"(dst), [src]"+&r"(src),
212 [h]"+&r"(h)
213 : [stride]"r"((mips_reg)stride),
214 [ff_pw_32]"f"(ff_pw_32),
215 [A]"f"(A), [E]"f"(E)
216 : "memory"
217 );
218 } else {
219 /* x==0, y!=0 */
220 E = y << 3;
221 A = 64 - E;
222
223 __asm__ volatile (
224 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
225 "dli %[tmp0], 0x06 \n\t"
226 "pshufh %[A], %[A], %[ftmp0] \n\t"
227 "pshufh %[E], %[E], %[ftmp0] \n\t"
228 "mtc1 %[tmp0], %[ftmp7] \n\t"
229
230 "1: \n\t"
231 MMI_ULDC1(%[ftmp1], %[src], 0x00)
232 PTR_ADDU "%[src], %[src], %[stride] \n\t"
233 MMI_ULDC1(%[ftmp2], %[src], 0x00)
234 PTR_ADDU "%[src], %[src], %[stride] \n\t"
235 MMI_ULDC1(%[ftmp8], %[src], 0x00)
236 "addi %[h], %[h], -0x02 \n\t"
237
238 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
239 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
240 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
241 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
242 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
243 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
244 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
245 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
246 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
247 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
248 "paddh %[ftmp3], %[ftmp3], %[ff_pw_32] \n\t"
249 "paddh %[ftmp4], %[ftmp4], %[ff_pw_32] \n\t"
250 "psrlh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
251 "psrlh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
252 "packushb %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
253
254 "punpcklbh %[ftmp3], %[ftmp2], %[ftmp0] \n\t"
255 "punpckhbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
256 "punpcklbh %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
257 "punpckhbh %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
258 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
259 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
260 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
261 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
262 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
263 "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
264 "paddh %[ftmp3], %[ftmp3], %[ff_pw_32] \n\t"
265 "paddh %[ftmp4], %[ftmp4], %[ff_pw_32] \n\t"
266 "psrlh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
267 "psrlh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
268 "packushb %[ftmp2], %[ftmp3], %[ftmp4] \n\t"
269
270 MMI_SDC1(%[ftmp1], %[dst], 0x00)
271 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
272 MMI_SDC1(%[ftmp2], %[dst], 0x00)
273 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
274 "bnez %[h], 1b \n\t"
275 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
276 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
277 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
278 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
279 [ftmp8]"=&f"(ftmp[8]), [tmp0]"=&r"(tmp[0]),
280 [dst]"+&r"(dst), [src]"+&r"(src),
281 [h]"+&r"(h)
282 : [stride]"r"((mips_reg)stride),
283 [ff_pw_32]"f"(ff_pw_32),
284 [A]"f"(A), [E]"f"(E)
285 : "memory"
286 );
287 }
288 }
289
ff_avg_h264_chroma_mc8_mmi(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int h,int x,int y)290 void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
291 int h, int x, int y)
292 {
293 int A = 64, B, C, D, E;
294 double ftmp[10];
295 uint64_t tmp[1];
296
297 if(!(x || y)){
298 /* x=0, y=0, A=64 */
299 __asm__ volatile (
300 "1: \n\t"
301 MMI_ULDC1(%[ftmp0], %[src], 0x00)
302 PTR_ADDU "%[src], %[src], %[stride] \n\t"
303 MMI_ULDC1(%[ftmp1], %[src], 0x00)
304 PTR_ADDU "%[src], %[src], %[stride] \n\t"
305 MMI_LDC1(%[ftmp2], %[dst], 0x00)
306 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
307 MMI_LDC1(%[ftmp3], %[dst], 0x00)
308 PTR_SUBU "%[dst], %[dst], %[stride] \n\t"
309 "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
310 "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
311 MMI_SDC1(%[ftmp0], %[dst], 0x00)
312 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
313 MMI_SDC1(%[ftmp1], %[dst], 0x00)
314 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
315 "addi %[h], %[h], -0x02 \n\t"
316 "bnez %[h], 1b \n\t"
317 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
318 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
319 [dst]"+&r"(dst), [src]"+&r"(src),
320 [h]"+&r"(h)
321 : [stride]"r"((mips_reg)stride)
322 : "memory"
323 );
324 } else if (x && y) {
325 /* x!=0, y!=0 */
326 D = x * y;
327 B = (x << 3) - D;
328 C = (y << 3) - D;
329 A = 64 - D - B - C;
330 __asm__ volatile (
331 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
332 "dli %[tmp0], 0x06 \n\t"
333 "pshufh %[A], %[A], %[ftmp0] \n\t"
334 "pshufh %[B], %[B], %[ftmp0] \n\t"
335 "mtc1 %[tmp0], %[ftmp9] \n\t"
336 "pshufh %[C], %[C], %[ftmp0] \n\t"
337 "pshufh %[D], %[D], %[ftmp0] \n\t"
338
339 "1: \n\t"
340 MMI_ULDC1(%[ftmp1], %[src], 0x00)
341 MMI_ULDC1(%[ftmp2], %[src], 0x01)
342 PTR_ADDU "%[src], %[src], %[stride] \n\t"
343 MMI_ULDC1(%[ftmp3], %[src], 0x00)
344 MMI_ULDC1(%[ftmp4], %[src], 0x01)
345 "addi %[h], %[h], -0x01 \n\t"
346
347 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
348 "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
349 "punpcklbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
350 "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
351 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
352 "pmullh %[ftmp7], %[ftmp7], %[B] \n\t"
353 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
354 "pmullh %[ftmp6], %[ftmp6], %[A] \n\t"
355 "pmullh %[ftmp8], %[ftmp8], %[B] \n\t"
356 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
357
358 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
359 "punpckhbh %[ftmp6], %[ftmp3], %[ftmp0] \n\t"
360 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t"
361 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
362 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
363 "pmullh %[ftmp7], %[ftmp7], %[D] \n\t"
364 "paddh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
365 "pmullh %[ftmp6], %[ftmp6], %[C] \n\t"
366 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t"
367 "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
368
369 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
370 "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
371 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
372 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
373 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
374 "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
375 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
376 MMI_LDC1(%[ftmp2], %[dst], 0x00)
377 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
378 MMI_SDC1(%[ftmp1], %[dst], 0x00)
379 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
380 "bnez %[h], 1b \n\t"
381 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
382 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
383 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
384 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
385 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
386 [tmp0]"=&r"(tmp[0]),
387 [dst]"+&r"(dst), [src]"+&r"(src),
388 [h]"+&r"(h)
389 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
390 [A]"f"(A), [B]"f"(B),
391 [C]"f"(C), [D]"f"(D)
392 : "memory"
393 );
394 } else if (x) {
395 /* x!=0, y==0 */
396 E = x << 3;
397 A = 64 - E;
398 __asm__ volatile (
399 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
400 "dli %[tmp0], 0x06 \n\t"
401 "pshufh %[A], %[A], %[ftmp0] \n\t"
402 "pshufh %[E], %[E], %[ftmp0] \n\t"
403 "mtc1 %[tmp0], %[ftmp7] \n\t"
404
405 "1: \n\t"
406 MMI_ULDC1(%[ftmp1], %[src], 0x00)
407 MMI_ULDC1(%[ftmp2], %[src], 0x01)
408 PTR_ADDU "%[src], %[src], %[stride] \n\t"
409 "addi %[h], %[h], -0x01 \n\t"
410
411 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
412 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
413 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
414 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
415 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
416 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
417 "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
418 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
419 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
420 "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
421
422 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
423 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
424 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
425 "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
426 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
427 MMI_LDC1(%[ftmp2], %[dst], 0x00)
428 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
429 MMI_SDC1(%[ftmp1], %[dst], 0x00)
430 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
431 "bnez %[h], 1b \n\t"
432 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
433 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
434 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
435 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
436 [tmp0]"=&r"(tmp[0]),
437 [dst]"+&r"(dst), [src]"+&r"(src),
438 [h]"+&r"(h)
439 : [stride]"r"((mips_reg)stride),
440 [ff_pw_32]"f"(ff_pw_32),
441 [A]"f"(A), [E]"f"(E)
442 : "memory"
443 );
444 } else {
445 /* x==0, y!=0 */
446 E = y << 3;
447 A = 64 - E;
448 __asm__ volatile (
449 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
450 "dli %[tmp0], 0x06 \n\t"
451 "pshufh %[A], %[A], %[ftmp0] \n\t"
452 "pshufh %[E], %[E], %[ftmp0] \n\t"
453 "mtc1 %[tmp0], %[ftmp7] \n\t"
454
455 "1: \n\t"
456 MMI_ULDC1(%[ftmp1], %[src], 0x00)
457 PTR_ADDU "%[src], %[src], %[stride] \n\t"
458 MMI_ULDC1(%[ftmp2], %[src], 0x00)
459 "addi %[h], %[h], -0x01 \n\t"
460
461 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
462 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
463 "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t"
464 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
465 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
466 "pmullh %[ftmp5], %[ftmp5], %[E] \n\t"
467 "paddh %[ftmp1], %[ftmp3], %[ftmp5] \n\t"
468 "pmullh %[ftmp4], %[ftmp4], %[A] \n\t"
469 "pmullh %[ftmp6], %[ftmp6], %[E] \n\t"
470 "paddh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
471
472 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
473 "paddh %[ftmp2], %[ftmp2], %[ff_pw_32] \n\t"
474 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
475 "psrlh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
476 "packushb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
477 MMI_LDC1(%[ftmp2], %[dst], 0x00)
478 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
479 MMI_SDC1(%[ftmp1], %[dst], 0x00)
480 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
481 "bnez %[h], 1b \n\t"
482 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
483 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
484 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
485 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
486 [tmp0]"=&r"(tmp[0]),
487 [dst]"+&r"(dst), [src]"+&r"(src),
488 [h]"+&r"(h)
489 : [stride]"r"((mips_reg)stride),
490 [ff_pw_32]"f"(ff_pw_32),
491 [A]"f"(A), [E]"f"(E)
492 : "memory"
493 );
494 }
495 }
496
ff_put_h264_chroma_mc4_mmi(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int h,int x,int y)497 void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
498 int h, int x, int y)
499 {
500 const int A = (8 - x) * (8 - y);
501 const int B = x * (8 - y);
502 const int C = (8 - x) * y;
503 const int D = x * y;
504 const int E = B + C;
505 double ftmp[8];
506 uint64_t tmp[1];
507 mips_reg addr[1];
508 DECLARE_VAR_LOW32;
509
510 if (D) {
511 __asm__ volatile (
512 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
513 "dli %[tmp0], 0x06 \n\t"
514 "pshufh %[A], %[A], %[ftmp0] \n\t"
515 "pshufh %[B], %[B], %[ftmp0] \n\t"
516 "mtc1 %[tmp0], %[ftmp7] \n\t"
517 "pshufh %[C], %[C], %[ftmp0] \n\t"
518 "pshufh %[D], %[D], %[ftmp0] \n\t"
519
520 "1: \n\t"
521 MMI_ULWC1(%[ftmp1], %[src], 0x00)
522 MMI_ULWC1(%[ftmp2], %[src], 0x01)
523 PTR_ADDU "%[src], %[src], %[stride] \n\t"
524 MMI_ULWC1(%[ftmp3], %[src], 0x00)
525 MMI_ULWC1(%[ftmp4], %[src], 0x01)
526
527 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
528 "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
529 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
530 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
531 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
532 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
533 "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
534 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
535 "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
536 "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
537 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
538 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
539 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
540 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
541
542 "addi %[h], %[h], -0x01 \n\t"
543 MMI_SWC1(%[ftmp1], %[dst], 0x00)
544 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
545 "bnez %[h], 1b \n\t"
546 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
547 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
548 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
549 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
550 [tmp0]"=&r"(tmp[0]),
551 RESTRICT_ASM_LOW32
552 [dst]"+&r"(dst), [src]"+&r"(src),
553 [h]"+&r"(h)
554 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
555 [A]"f"(A), [B]"f"(B),
556 [C]"f"(C), [D]"f"(D)
557 : "memory"
558 );
559 } else if (E) {
560 const int step = C ? stride : 1;
561 __asm__ volatile (
562 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
563 "dli %[tmp0], 0x06 \n\t"
564 "pshufh %[A], %[A], %[ftmp0] \n\t"
565 "pshufh %[E], %[E], %[ftmp0] \n\t"
566 "mtc1 %[tmp0], %[ftmp5] \n\t"
567
568 "1: \n\t"
569 MMI_ULWC1(%[ftmp1], %[src], 0x00)
570 PTR_ADDU "%[addr0], %[src], %[step] \n\t"
571 MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
572 PTR_ADDU "%[src], %[src], %[stride] \n\t"
573 "addi %[h], %[h], -0x01 \n\t"
574 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
575 "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
576 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
577 "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
578 "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
579 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
580 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
581 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
582 MMI_SWC1(%[ftmp1], %[dst], 0x00)
583 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
584 "bnez %[h], 1b \n\t"
585 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
586 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
587 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
588 [tmp0]"=&r"(tmp[0]),
589 RESTRICT_ASM_LOW32
590 [addr0]"=&r"(addr[0]),
591 [dst]"+&r"(dst), [src]"+&r"(src),
592 [h]"+&r"(h)
593 : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
594 [ff_pw_32]"f"(ff_pw_32),
595 [A]"f"(A), [E]"f"(E)
596 : "memory"
597 );
598 } else {
599 __asm__ volatile (
600 "1: \n\t"
601 MMI_ULWC1(%[ftmp0], %[src], 0x00)
602 PTR_ADDU "%[src], %[src], %[stride] \n\t"
603 MMI_ULWC1(%[ftmp1], %[src], 0x00)
604 PTR_ADDU "%[src], %[src], %[stride] \n\t"
605 "addi %[h], %[h], -0x02 \n\t"
606 MMI_SWC1(%[ftmp0], %[dst], 0x00)
607 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
608 MMI_SWC1(%[ftmp1], %[dst], 0x00)
609 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
610 "bnez %[h], 1b \n\t"
611 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
612 [dst]"+&r"(dst), [src]"+&r"(src),
613 RESTRICT_ASM_LOW32
614 [h]"+&r"(h)
615 : [stride]"r"((mips_reg)stride)
616 : "memory"
617 );
618 }
619 }
620
ff_avg_h264_chroma_mc4_mmi(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int h,int x,int y)621 void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
622 int h, int x, int y)
623 {
624 const int A = (8 - x) *(8 - y);
625 const int B = x * (8 - y);
626 const int C = (8 - x) * y;
627 const int D = x * y;
628 const int E = B + C;
629 double ftmp[8];
630 uint64_t tmp[1];
631 mips_reg addr[1];
632 DECLARE_VAR_LOW32;
633
634 if (D) {
635 __asm__ volatile (
636 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
637 "dli %[tmp0], 0x06 \n\t"
638 "pshufh %[A], %[A], %[ftmp0] \n\t"
639 "pshufh %[B], %[B], %[ftmp0] \n\t"
640 "mtc1 %[tmp0], %[ftmp7] \n\t"
641 "pshufh %[C], %[C], %[ftmp0] \n\t"
642 "pshufh %[D], %[D], %[ftmp0] \n\t"
643
644 "1: \n\t"
645 MMI_ULWC1(%[ftmp1], %[src], 0x00)
646 MMI_ULWC1(%[ftmp2], %[src], 0x01)
647 PTR_ADDU "%[src], %[src], %[stride] \n\t"
648 MMI_ULWC1(%[ftmp3], %[src], 0x00)
649 MMI_ULWC1(%[ftmp4], %[src], 0x01)
650
651 "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
652 "punpcklbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
653 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t"
654 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t"
655 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
656 "punpcklbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"
657 "punpcklbh %[ftmp6], %[ftmp4], %[ftmp0] \n\t"
658 "pmullh %[ftmp5], %[ftmp5], %[C] \n\t"
659 "pmullh %[ftmp6], %[ftmp6], %[D] \n\t"
660 "paddh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
661 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
662 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
663 "psrlh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
664 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
665 MMI_LWC1(%[ftmp2], %[dst], 0x00)
666 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
667
668 "addi %[h], %[h], -0x01 \n\t"
669 MMI_SWC1(%[ftmp1], %[dst], 0x00)
670 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
671 "bnez %[h], 1b \n\t"
672 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
673 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
674 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
675 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
676 [tmp0]"=&r"(tmp[0]),
677 RESTRICT_ASM_LOW32
678 [dst]"+&r"(dst), [src]"+&r"(src),
679 [h]"+&r"(h)
680 : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
681 [A]"f"(A), [B]"f"(B),
682 [C]"f"(C), [D]"f"(D)
683 : "memory"
684 );
685 } else if (E) {
686 const int step = C ? stride : 1;
687 __asm__ volatile (
688 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
689 "dli %[tmp0], 0x06 \n\t"
690 "pshufh %[A], %[A], %[ftmp0] \n\t"
691 "pshufh %[E], %[E], %[ftmp0] \n\t"
692 "mtc1 %[tmp0], %[ftmp5] \n\t"
693
694 "1: \n\t"
695 MMI_ULWC1(%[ftmp1], %[src], 0x00)
696 PTR_ADDU "%[addr0], %[src], %[step] \n\t"
697 MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
698 PTR_ADDU "%[src], %[src], %[stride] \n\t"
699 "addi %[h], %[h], -0x01 \n\t"
700 "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t"
701 "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
702 "pmullh %[ftmp3], %[ftmp3], %[A] \n\t"
703 "pmullh %[ftmp4], %[ftmp4], %[E] \n\t"
704 "paddh %[ftmp1], %[ftmp3], %[ftmp4] \n\t"
705 "paddh %[ftmp1], %[ftmp1], %[ff_pw_32] \n\t"
706 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
707 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
708 MMI_LWC1(%[ftmp2], %[dst], 0x00)
709 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
710 MMI_SWC1(%[ftmp1], %[dst], 0x00)
711 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
712 "bnez %[h], 1b \n\t"
713 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
714 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
715 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
716 [tmp0]"=&r"(tmp[0]),
717 RESTRICT_ASM_LOW32
718 [addr0]"=&r"(addr[0]),
719 [dst]"+&r"(dst), [src]"+&r"(src),
720 [h]"+&r"(h)
721 : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
722 [ff_pw_32]"f"(ff_pw_32),
723 [A]"f"(A), [E]"f"(E)
724 : "memory"
725 );
726 } else {
727 __asm__ volatile (
728 "1: \n\t"
729 MMI_ULWC1(%[ftmp0], %[src], 0x00)
730 PTR_ADDU "%[src], %[src], %[stride] \n\t"
731 MMI_ULWC1(%[ftmp1], %[src], 0x00)
732 PTR_ADDU "%[src], %[src], %[stride] \n\t"
733 "addi %[h], %[h], -0x02 \n\t"
734 MMI_LWC1(%[ftmp2], %[dst], 0x00)
735 "pavgb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
736 MMI_SWC1(%[ftmp0], %[dst], 0x00)
737 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
738 MMI_LWC1(%[ftmp3], %[dst], 0x00)
739 "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
740 MMI_SWC1(%[ftmp1], %[dst], 0x00)
741 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
742 "bnez %[h], 1b \n\t"
743 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
744 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
745 [dst]"+&r"(dst), [src]"+&r"(src),
746 RESTRICT_ASM_LOW32
747 [h]"+&r"(h)
748 : [stride]"r"((mips_reg)stride)
749 : "memory"
750 );
751 }
752 }
753