/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for MIPS MMI.
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)

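// Transpose an 8-row band of 8-bit pixels: reads 8 rows of |width| bytes
// from |src| and writes them out as |width| rows of 8 bytes at |dst|. The
// loop only terminates when |width| reaches exactly zero, so |width| is
// assumed to be a positive multiple of 8.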
void TransposeWx8_MMI(const uint8_t* src,
                      int src_stride,
                      uint8_t* dst,
                      int dst_stride,
                      int width) {
  uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
  uint8_t* src_tmp = nullptr;

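  // Each iteration transposes one 8x8 byte tile in three interleave stages:
  // bytes (punpcklbh/punpckhbh), halfwords (punpcklhw/punpckhhw), then words
  // (punpcklwd/punpckhwd). The eight transposed rows are written with
  // gssdlc1/gssdrc1 pairs, which together perform an unaligned 64-bit store.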
  __asm__ volatile(
      "1:                                                             \n\t"
33 "ldc1 %[tmp12], 0x00(%[src]) \n\t"
34 "dadd %[src_tmp], %[src], %[src_stride] \n\t"
35 "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
36
37 /* tmp0 = (00 10 01 11 02 12 03 13) */
38 "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
39 /* tmp1 = (04 14 05 15 06 16 07 17) */
40 "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
41
42 "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
43 "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
44 "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
45 "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
46
47 /* tmp2 = (20 30 21 31 22 32 23 33) */
48 "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
49 /* tmp3 = (24 34 25 35 26 36 27 37) */
50 "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
51
52 /* tmp4 = (00 10 20 30 01 11 21 31) */
53 "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
54 /* tmp5 = (02 12 22 32 03 13 23 33) */
55 "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
56 /* tmp6 = (04 14 24 34 05 15 25 35) */
57 "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
58 /* tmp7 = (06 16 26 36 07 17 27 37) */
59 "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
60
61 "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
62 "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
63 "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
64 "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
65
66 /* tmp0 = (40 50 41 51 42 52 43 53) */
67 "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
68 /* tmp1 = (44 54 45 55 46 56 47 57) */
69 "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
70
71 "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
72 "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
73 "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
74 "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
75
76 /* tmp2 = (60 70 61 71 62 72 63 73) */
77 "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
78 /* tmp3 = (64 74 65 75 66 76 67 77) */
79 "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
80
81 /* tmp8 = (40 50 60 70 41 51 61 71) */
82 "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
83 /* tmp9 = (42 52 62 72 43 53 63 73) */
84 "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
85 /* tmp10 = (44 54 64 74 45 55 65 75) */
86 "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
87 /* tmp11 = (46 56 66 76 47 57 67 77) */
88 "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
89
90 /* tmp0 = (00 10 20 30 40 50 60 70) */
91 "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
92 /* tmp1 = (01 11 21 31 41 51 61 71) */
93 "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
94 "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
95 "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
96 "dadd %[dst], %[dst], %[dst_stride] \n\t"
97 "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
98 "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
99
100 /* tmp0 = (02 12 22 32 42 52 62 72) */
101 "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
102 /* tmp1 = (03 13 23 33 43 53 63 73) */
103 "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
104 "dadd %[dst], %[dst], %[dst_stride] \n\t"
105 "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
106 "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
107 "dadd %[dst], %[dst], %[dst_stride] \n\t"
108 "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
109 "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
110
111 /* tmp0 = (04 14 24 34 44 54 64 74) */
112 "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
113 /* tmp1 = (05 15 25 35 45 55 65 75) */
114 "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
115 "dadd %[dst], %[dst], %[dst_stride] \n\t"
116 "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
117 "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
118 "dadd %[dst], %[dst], %[dst_stride] \n\t"
119 "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
120 "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
121
122 /* tmp0 = (06 16 26 36 46 56 66 76) */
123 "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
124 /* tmp1 = (07 17 27 37 47 57 67 77) */
125 "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
126 "dadd %[dst], %[dst], %[dst_stride] \n\t"
127 "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
128 "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
129 "dadd %[dst], %[dst], %[dst_stride] \n\t"
130 "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
131 "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
132
133 "dadd %[dst], %[dst], %[dst_stride] \n\t"
134 "daddi %[src], %[src], 0x08 \n\t"
135 "daddi %[width], %[width], -0x08 \n\t"
136 "bnez %[width], 1b \n\t"
137
      : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
        [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
        [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
        [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
        [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
        [src] "+&r"(src), [width] "+&r"(width), [src_tmp] "+&r"(src_tmp)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "memory");
}

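// Transpose an 8-row band of interleaved UV pixels: reads 8 rows of |width|
// UV pairs from |src| and writes |width| rows of 8 U bytes to |dst_a| and
// 8 V bytes to |dst_b|. Each iteration consumes 4 UV pairs per row, so
// |width| is assumed to be a positive multiple of 4.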
void TransposeUVWx8_MMI(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst_a,
                        int dst_stride_a,
                        uint8_t* dst_b,
                        int dst_stride_b,
                        int width) {
  uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
  uint8_t* src_tmp = nullptr;

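  // Same three-stage 8x8 byte-tile transpose as TransposeWx8_MMI. Because U
  // and V alternate in the source bytes, the byte and halfword interleaves
  // group each tile into a U half and a V half, and the final word
  // interleave emits whole U rows (stored to dst_a) and V rows (dst_b).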
  __asm__ volatile(
      "1:                                                             \n\t"
      /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
      "ldc1        %[tmp12],        0x00(%[src])                      \n\t"
      "dadd        %[src_tmp],      %[src],          %[src_stride]    \n\t"
      /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
      "ldc1        %[tmp13],        0x00(%[src_tmp])                  \n\t"

      /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
      "punpcklbh   %[tmp0],         %[tmp12],        %[tmp13]         \n\t"
      /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
      "punpckhbh   %[tmp1],         %[tmp12],        %[tmp13]         \n\t"

      "dadd        %[src_tmp],      %[src_tmp],      %[src_stride]    \n\t"
      /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
      "ldc1        %[tmp12],        0x00(%[src_tmp])                  \n\t"
      "dadd        %[src_tmp],      %[src_tmp],      %[src_stride]    \n\t"
      /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
      "ldc1        %[tmp13],        0x00(%[src_tmp])                  \n\t"

      /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
      "punpcklbh   %[tmp2],         %[tmp12],        %[tmp13]         \n\t"
      /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
      "punpckhbh   %[tmp3],         %[tmp12],        %[tmp13]         \n\t"

      /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
      "punpcklhw   %[tmp4],         %[tmp0],         %[tmp2]          \n\t"
      /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
      "punpckhhw   %[tmp5],         %[tmp0],         %[tmp2]          \n\t"
      /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
      "punpcklhw   %[tmp6],         %[tmp1],         %[tmp3]          \n\t"
      /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
      "punpckhhw   %[tmp7],         %[tmp1],         %[tmp3]          \n\t"

      "dadd        %[src_tmp],      %[src_tmp],      %[src_stride]    \n\t"
      /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
      "ldc1        %[tmp12],        0x00(%[src_tmp])                  \n\t"
      "dadd        %[src_tmp],      %[src_tmp],      %[src_stride]    \n\t"
      /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
      "ldc1        %[tmp13],        0x00(%[src_tmp])                  \n\t"

      /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
      "punpcklbh   %[tmp0],         %[tmp12],        %[tmp13]         \n\t"
      /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
      "punpckhbh   %[tmp1],         %[tmp12],        %[tmp13]         \n\t"

      "dadd        %[src_tmp],      %[src_tmp],      %[src_stride]    \n\t"
      /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
      "ldc1        %[tmp12],        0x00(%[src_tmp])                  \n\t"
      "dadd        %[src_tmp],      %[src_tmp],      %[src_stride]    \n\t"
      /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
      "ldc1        %[tmp13],        0x00(%[src_tmp])                  \n\t"

      /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
      "punpcklbh   %[tmp2],         %[tmp12],        %[tmp13]         \n\t"
      /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
      "punpckhbh   %[tmp3],         %[tmp12],        %[tmp13]         \n\t"

      /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
      "punpcklhw   %[tmp8],         %[tmp0],         %[tmp2]          \n\t"
      /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
      "punpckhhw   %[tmp9],         %[tmp0],         %[tmp2]          \n\t"
      /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
      "punpcklhw   %[tmp10],        %[tmp1],         %[tmp3]          \n\t"
      /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
      "punpckhhw   %[tmp11],        %[tmp1],         %[tmp3]          \n\t"

      /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
      "punpcklwd   %[tmp0],         %[tmp4],         %[tmp8]          \n\t"
      /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
      "punpckhwd   %[tmp1],         %[tmp4],         %[tmp8]          \n\t"
      "gssdlc1     %[tmp0],         0x07(%[dst_a])                    \n\t"
      "gssdrc1     %[tmp0],         0x00(%[dst_a])                    \n\t"
      "gssdlc1     %[tmp1],         0x07(%[dst_b])                    \n\t"
      "gssdrc1     %[tmp1],         0x00(%[dst_b])                    \n\t"

      /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
      "punpcklwd   %[tmp0],         %[tmp5],         %[tmp9]          \n\t"
      /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
      "punpckhwd   %[tmp1],         %[tmp5],         %[tmp9]          \n\t"
      "dadd        %[dst_a],        %[dst_a],        %[dst_stride_a]  \n\t"
      "gssdlc1     %[tmp0],         0x07(%[dst_a])                    \n\t"
      "gssdrc1     %[tmp0],         0x00(%[dst_a])                    \n\t"
      "dadd        %[dst_b],        %[dst_b],        %[dst_stride_b]  \n\t"
      "gssdlc1     %[tmp1],         0x07(%[dst_b])                    \n\t"
      "gssdrc1     %[tmp1],         0x00(%[dst_b])                    \n\t"

      /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
      "punpcklwd   %[tmp0],         %[tmp6],         %[tmp10]         \n\t"
      /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
      "punpckhwd   %[tmp1],         %[tmp6],         %[tmp10]         \n\t"
      "dadd        %[dst_a],        %[dst_a],        %[dst_stride_a]  \n\t"
      "gssdlc1     %[tmp0],         0x07(%[dst_a])                    \n\t"
      "gssdrc1     %[tmp0],         0x00(%[dst_a])                    \n\t"
      "dadd        %[dst_b],        %[dst_b],        %[dst_stride_b]  \n\t"
      "gssdlc1     %[tmp1],         0x07(%[dst_b])                    \n\t"
      "gssdrc1     %[tmp1],         0x00(%[dst_b])                    \n\t"

      /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
      "punpcklwd   %[tmp0],         %[tmp7],         %[tmp11]         \n\t"
      /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
      "punpckhwd   %[tmp1],         %[tmp7],         %[tmp11]         \n\t"
      "dadd        %[dst_a],        %[dst_a],        %[dst_stride_a]  \n\t"
      "gssdlc1     %[tmp0],         0x07(%[dst_a])                    \n\t"
      "gssdrc1     %[tmp0],         0x00(%[dst_a])                    \n\t"
      "dadd        %[dst_b],        %[dst_b],        %[dst_stride_b]  \n\t"
      "gssdlc1     %[tmp1],         0x07(%[dst_b])                    \n\t"
      "gssdrc1     %[tmp1],         0x00(%[dst_b])                    \n\t"

      "dadd        %[dst_a],        %[dst_a],        %[dst_stride_a]  \n\t"
      "dadd        %[dst_b],        %[dst_b],        %[dst_stride_b]  \n\t"
      "daddiu      %[src],          %[src],          0x08             \n\t"
      "daddi       %[width],        %[width],        -0x04            \n\t"
      "bnez        %[width],        1b                                \n\t"

      : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
        [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
        [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
        [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
        [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
        [dst_b] "+&r"(dst_b), [src] "+&r"(src), [width] "+&r"(width),
        [src_tmp] "+&r"(src_tmp)
      : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b),
        [src_stride] "r"(src_stride)
      : "memory");
}

#endif  // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif