1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 /*********************************************************************************/
19 /* Filename: sad_mb_offset.h */
20 /* Description: Implementation for in-line functions used in dct.cpp */
21 /* Modified: */
22 /*********************************************************************************/
23
24 #if !defined(PV_ARM_GCC_V4) && !defined(PV_ARM_GCC_V5) /* plain C version: used when neither ARM GCC macro is defined */
25
26 #if (NUMBER==3)
sad_mb_offset3(UChar * ref,UChar * blk,Int lx,Int dmin)27 __inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
28 #elif (NUMBER==2)
29 __inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
30 #elif (NUMBER==1)
31 __inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
32 #endif
33 {
34 int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
35
36 // x5 = (x4<<8) - x4;
37 x4 = x5 = 0;
38 x6 = 0xFFFF00FF;
39 x9 = 0x80808080; /* const. */
40 ref -= NUMBER; /* bic ref, ref, #3 */
41 ref -= lx;
42 blk -= 16;
43 x8 = 16;
44
45 #if (NUMBER==3)
46 LOOP_SAD3:
47 #elif (NUMBER==2)
48 LOOP_SAD2:
49 #elif (NUMBER==1)
50 LOOP_SAD1:
51 #endif
52 /****** process 8 pixels ******/
53 x10 = *((uint32*)(ref += lx)); /* D C B A */
54 x11 = *((uint32*)(ref + 4)); /* H G F E */
55 x12 = *((uint32*)(ref + 8)); /* L K J I */
56
57 x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
58 x10 = x10 | (x11 << (32 - SHIFT)); /* G F E D */
59 x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
60 x11 = x11 | (x12 << (32 - SHIFT)); /* K J I H */
61
62 x12 = *((uint32*)(blk += 16));
63 x14 = *((uint32*)(blk + 4));
64
65 /* process x11 & x14 */
66 x11 = sad_4pixel(x11, x14, x9);
67
68 /* process x12 & x10 */
69 x10 = sad_4pixel(x10, x12, x9);
70
71 x5 = x5 + x10; /* accumulate low bytes */
72 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
73 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
74 x5 = x5 + x11; /* accumulate low bytes */
75 x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
76 x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
77
78 /****** process 8 pixels ******/
79 x10 = *((uint32*)(ref + 8)); /* D C B A */
80 x11 = *((uint32*)(ref + 12)); /* H G F E */
81 x12 = *((uint32*)(ref + 16)); /* L K J I */
82
83 x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24 = 0xFF 0xFF 0xFF ~D */
84 x10 = x10 | (x11 << (32 - SHIFT)); /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
85 x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */
86 x11 = x11 | (x12 << (32 - SHIFT)); /* ~K ~J ~I ~H */
87
88 x12 = *((uint32*)(blk + 8));
89 x14 = *((uint32*)(blk + 12));
90
91 /* process x11 & x14 */
92 x11 = sad_4pixel(x11, x14, x9);
93
94 /* process x12 & x10 */
95 x10 = sad_4pixel(x10, x12, x9);
96
97 x5 = x5 + x10; /* accumulate low bytes */
98 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
99 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
100 x5 = x5 + x11; /* accumulate low bytes */
101 x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
102 x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
103
104 /****************/
105 x10 = x5 - (x4 << 8); /* extract low bytes */
106 x10 = x10 + x4; /* add with high bytes */
107 x10 = x10 + (x10 << 16); /* add with lower half word */
108
109 if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
110 {
111 if (--x8)
112 {
113 #if (NUMBER==3)
114 goto LOOP_SAD3;
115 #elif (NUMBER==2)
116 goto LOOP_SAD2;
117 #elif (NUMBER==1)
118 goto LOOP_SAD1;
119 #endif
120 }
121
122 }
123
124 return ((uint32)x10 >> 16);
125 }
126
127 #elif defined(__CC_ARM) /* only work with arm v5 */
128
/*
 * sad_mb_offset<NUMBER> -- ARM RealView (__CC_ARM) embedded-assembler version.
 *
 * Each pass of the loop handles one row of 16 bytes:
 *   - the low two address bits of ref are cleared once, before the loop;
 *   - reference words are realigned by SHIFT bits across neighbouring words
 *     and bitwise complemented (MVN/BIC pairs);
 *   - the complemented words are paired with the words of blk through
 *     sad_4pixelN() and folded into x4/x5 by the sum_accumulate macro;
 *   - the second group of 8 bytes (offsets 8..15) is processed first, then
 *     the first group (offsets 0..7), whose loads also post-increment ref by
 *     lx and blk by 16.
 *
 *   ref   reference pixels, advanced by lx bytes per pass.
 *   blk   current block, advanced by 16 bytes per pass.
 *   lx    byte stride of ref.
 *   dmin  early-out threshold: the loop is left as soon as the running total
 *         is higher than dmin (unsigned comparison).
 *   x8    loop-counter state supplied by the caller; INC_X8 is added to it
 *         after every pass that stays within dmin, and the loop continues
 *         while that addition leaves the LS condition true.
 *
 * Returns the running total (upper 16 bits of the folded accumulator).
 *
 * NOTE(review): sad_4pixelN, sum_accumulate, SHIFT and INC_X8 are defined
 * outside this file; the number of passes depends on INC_X8 and the caller's
 * initial x8 -- confirm against those definitions.
 */
129 #if (NUMBER==3)
sad_mb_offset3(UChar * ref,UChar * blk,Int lx,Int dmin,int32 x8)130 __inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
131 #elif (NUMBER==2)
132 __inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
133 #elif (NUMBER==1)
134 __inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
135 #endif
136 {
137 int32 x4, x5, x6, x9, x10, x11, x12, x14;
138
139 x9 = 0x80808080; /* constant passed to sad_4pixelN */
140 x4 = x5 = 0; /* accumulators updated by sum_accumulate */
141
/* x6 = ~0x00ff0000 = 0xFF00FFFF (not referenced by name below; presumably
   consumed by the sum_accumulate macro -- confirm).  BIC clears the low two
   bits of ref so that all following word loads start on a 4-byte boundary. */
142 __asm{
143 MVN x6, #0xff0000;
144 BIC ref, ref, #3;
145
146 #if (NUMBER==3)
147 LOOP_SAD3:
148 #elif (NUMBER==2)
149 LOOP_SAD2:
150 #elif (NUMBER==1)
151 LOOP_SAD1:
152 #endif
153 }
154 /****** process 8 pixels (byte offsets 8..15 of the row) ******/
155 x11 = *((int32*)(ref + 12));
156 x12 = *((int32*)(ref + 16));
157 x10 = *((int32*)(ref + 8));
158 x14 = *((int32*)(blk + 12));
159
/* x10 = ~((x10 >> SHIFT) | (x11 << (32-SHIFT))) and
   x11 = ~((x11 >> SHIFT) | (x12 << (32-SHIFT))): the realigned reference
   words, complemented.  x12 is then reloaded with the blk word at offset 8. */
160 __asm{
161 MVN x10, x10, lsr #SHIFT;
162 BIC x10, x10, x11, lsl #(32-SHIFT);
163 MVN x11, x11, lsr #SHIFT;
164 BIC x11, x11, x12, lsl #(32-SHIFT);
165
166 LDR x12, [blk, #8];
167 }
168
169 /* process x11 & x14 */
170 x11 = sad_4pixelN(x11, x14, x9);
171
172 /* process x12 & x10 */
173 x10 = sad_4pixelN(x10, x12, x9);
174
175 sum_accumulate;
176
/* Second group: byte offsets 0..7 of the row.  "LDR x10, [ref], lx" and
   "LDR x12, [blk], #16" are post-indexed loads: they read from the current
   address and then advance ref by lx and blk by 16 for the next pass. */
177 __asm{
178 /****** process 8 pixels ******/
179 LDR x11, [ref, #4];
180 LDR x12, [ref, #8];
181 LDR x10, [ref], lx ;
182 LDR x14, [blk, #4];
183
184 MVN x10, x10, lsr #SHIFT;
185 BIC x10, x10, x11, lsl #(32-SHIFT);
186 MVN x11, x11, lsr #SHIFT;
187 BIC x11, x11, x12, lsl #(32-SHIFT);
188
189 LDR x12, [blk], #16;
190 }
191
192 /* process x11 & x14 */
193 x11 = sad_4pixelN(x11, x14, x9);
194
195 /* process x12 & x10 */
196 x10 = sad_4pixelN(x10, x12, x9);
197
198 sum_accumulate;
199
200 /****************/
201 x10 = x5 - (x4 << 8); /* extract low bytes */
202 x10 = x10 + x4; /* add with high bytes */
203 x10 = x10 + (x10 << 16); /* add with lower half word */
204
/* RSBS computes (x10 >> 16) - dmin and sets the flags.  If the total is
   lower than or equal to dmin (LS), ADDLSS adds INC_X8 to x8 and replaces the
   flags with those of the addition; BLS then loops while LS still holds.
   If the total is higher than dmin neither conditional instruction runs and
   control falls through to the return. */
205 __asm{
206 RSBS x11, dmin, x10, lsr #16
207 ADDLSS x8, x8, #INC_X8
208 #if (NUMBER==3)
209 BLS LOOP_SAD3;
210 #elif (NUMBER==2)
211 BLS LOOP_SAD2;
212 #elif (NUMBER==1)
213 BLS LOOP_SAD1;
214 #endif
215 }
216
217 return ((uint32)x10 >> 16);
218 }
219
220 #elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER */
221
222 #if (NUMBER==3)
sad_mb_offset3(UChar * ref,UChar * blk,Int lx,Int dmin)223 __inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
224 #elif (NUMBER==2)
225 __inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
226 #elif (NUMBER==1)
227 __inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
228 #endif
229 {
230 int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
231
232 // x5 = (x4<<8) - x4;
233 x4 = x5 = 0;
234 x6 = 0xFFFF00FF;
235 x9 = 0x80808080; /* const. */
236 ref -= NUMBER; /* bic ref, ref, #3 */
237 ref -= lx;
238 x8 = 16;
239
240 #if (NUMBER==3)
241 LOOP_SAD3:
242 #elif (NUMBER==2)
243 LOOP_SAD2:
244 #elif (NUMBER==1)
245 LOOP_SAD1:
246 #endif
247 /****** process 8 pixels ******/
248 x10 = *((uint32*)(ref += lx)); /* D C B A */
249 x11 = *((uint32*)(ref + 4)); /* H G F E */
250 x12 = *((uint32*)(ref + 8)); /* L K J I */
251
252 int32 shift = SHIFT;
253 int32 shift2 = 32 - SHIFT;
254 asm volatile("ldr %3, [%4, #4]\n\t"
255 "mvn %0, %0, lsr %5\n\t"
256 "bic %0, %0, %1, lsl %6\n\t"
257 "mvn %1, %1, lsr %5\n\t"
258 "bic %1, %1, %2, lsl %6\n\t"
259 "ldr %2, [%4, #8]"
260 : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
261 : "r"(blk), "r"(shift), "r"(shift2));
262
263 /* process x11 & x14 */
264 x11 = sad_4pixel(x11, x14, x9);
265
266 /* process x12 & x10 */
267 x10 = sad_4pixel(x10, x12, x9);
268
269 sum_accumulate;
270
271 /****** process 8 pixels ******/
272 x10 = *((uint32*)(ref + 8)); /* D C B A */
273 x11 = *((uint32*)(ref + 12)); /* H G F E */
274 x12 = *((uint32*)(ref + 16)); /* L K J I */
275
276 asm volatile("ldr %3, [%4, #4]\n\t"
277 "mvn %0, %0, lsr %5\n\t"
278 "bic %0, %0, %1, lsl %6\n\t"
279 "mvn %1, %1, lsr %5\n\t"
280 "bic %1, %1, %2, lsl %6\n\t"
281 "ldr %2, [%4, #8]"
282 : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
283 : "r"(blk), "r"(shift), "r"(shift2));
284
285 /* process x11 & x14 */
286 x11 = sad_4pixel(x11, x14, x9);
287
288 /* process x12 & x10 */
289 x10 = sad_4pixel(x10, x12, x9);
290
291 sum_accumulate;
292
293 /****************/
294 x10 = x5 - (x4 << 8); /* extract low bytes */
295 x10 = x10 + x4; /* add with high bytes */
296 x10 = x10 + (x10 << 16); /* add with lower half word */
297
298 if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
299 {
300 if (--x8)
301 {
302 #if (NUMBER==3)
303 goto LOOP_SAD3;
304 #elif (NUMBER==2)
305 goto LOOP_SAD2;
306 #elif (NUMBER==1)
307 goto LOOP_SAD1;
308 #endif
309 }
310
311 }
312
313 return ((uint32)x10 >> 16);
314 }
315
316 #endif
317
318