/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavcodec/me_cmp.h"
#include "asm.h"
25 
26 int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
27 
/**
 * Rounded-up per-byte average of two quadwords of eight packed bytes:
 * every byte lane becomes (x + y + 1) >> 1.  Uses the carry-free identity
 * (a | b) == (a & b) + (a ^ b), so (a | b) - ((a ^ b) >> 1) is the
 * rounded average without overflowing a lane.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    uint64_t diff = a ^ b;

    /* Clear each lane's LSB before the word-wide shift so no bit can
       leak into the neighbouring byte lane. */
    return (a | b) - ((diff & BYTE_VEC(0xfe)) >> 1);
}

/**
 * Rounded per-byte average of four quadwords of eight packed bytes:
 * every byte lane becomes (x1 + x2 + x3 + x4 + 2) >> 2.  The sum is split
 * into a high-bits part and a low-two-bits part so that no per-lane sum
 * can carry into the neighbouring byte lane.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2  = BYTE_VEC(0x03); /* the two LSBs of every lane */
    const uint64_t round = BYTE_VEC(0x02); /* per-lane +2 rounding term  */

    /* Sum of the upper six bits of each lane; pre-shifting by 2 keeps
       each partial sum within its byte. */
    uint64_t high = ((l1 & ~low2) >> 2)
                  + ((l2 & ~low2) >> 2)
                  + ((l3 & ~low2) >> 2)
                  + ((l4 & ~low2) >> 2);
    /* Rounded sum of the two low bits of each lane: at most
       4 * 3 + 2 = 14, so it cannot overflow a lane before the shift. */
    uint64_t low  = (((l1 & low2)
                    + (l2 & low2)
                    + (l3 & low2)
                    + (l4 & low2)
                    + round) >> 2) & low2;
    return high + low;
}

/**
 * Sum of absolute differences between an 8xh block in pix1 and the
 * reference block at pix2.  pix1 is assumed to be 8-byte aligned
 * (loaded with ldq); pix2 may be unaligned, and the alignment test is
 * hoisted out of the loop so each row needs only one code path.
 */
static int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;

    if (((size_t) pix2 & 0x7) == 0) {
        /* pix2 happens to be 8-byte aligned: plain aligned loads. */
        do {
            sum  += perr(ldq(pix1), ldq(pix2));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        /* uldq works only when pix2 is actually unaligned. */
        do {                    /* one 8-pixel row per iteration */
            sum  += perr(ldq(pix1), uldq(pix2));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return sum;
}

#if 0                           /* now done in assembly */
/* Plain C reference implementation of the 16x16 SAD, kept for
 * documentation only; the optimized replacement is the external
 * pix_abs16x16_mvi_asm declared near the top of this file. */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixel a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            /* Build two aligned quadwords from three unaligned loads:
               extql/extqh funnel the bytes selected by the low bits of
               pix2 into place. */
            t     = ldq_u(pix2 + 8);
            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif

/**
 * Sum of absolute differences between a 16xh block in pix1 and the
 * horizontally half-pel interpolated reference at pix2: each reference
 * byte is avg2(pix2[i], pix2[i + 1]), so 17 source bytes per row are
 * read.  The switch selects a load strategy from the low three address
 * bits of pix2 (assumed constant across rows, i.e. line_size is a
 * multiple of 8 — NOTE(review): not checked here).
 */
static int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        /* pix2 is 8-byte aligned: the "shifted by one pixel" operand is
           made by shifting right one byte (little-endian Alpha) and
           funnelling in the low byte of the following quadword. */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            /* The 17th pixel of the row comes in as a single byte load. */
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh.  At least it is a bit faster
           that way :)  */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            /* With disalign == 7 the "one pixel right" view is exactly
               the next aligned quadword, so m and r can be used as-is
               for the second avg2 operand. */
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        /* General unaligned case: extract both the disalign and the
           disalign+1 views from the same three raw loads and average. */
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign),
                         extql(l, disalign1) | extqh(m, disalign1));
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign),
                         extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}

/**
 * Sum of absolute differences between a 16xh block in pix1 and the
 * vertically half-pel interpolated reference at pix2: each reference
 * byte is avg2() of the pixel and the one directly below it.  The row
 * already loaded is carried over between iterations, so every source
 * row is fetched only once.
 */
static int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;

    if ((size_t) pix2 & 0x7) {
        /* Unaligned reference: merge three raw loads per row with
           extql/extqh to form two aligned quadwords. */
        uint64_t hi, cur_l, cur_r;

        hi    = ldq_u(pix2 + 8);
        cur_l = extql(ldq_u(pix2), pix2) | extqh(hi, pix2);
        cur_r = extql(hi, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t ref_l, ref_r, nxt_l, nxt_r, t;

            ref_l = ldq(pix1);
            ref_r = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            nxt_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            nxt_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sum += perr(ref_l, avg2(cur_l, nxt_l))
                 + perr(ref_r, avg2(cur_r, nxt_r));

            pix1 += line_size;
            /* The freshly loaded row becomes the "current" row. */
            cur_l = nxt_l;
            cur_r = nxt_r;
        } while (--h);
    } else {
        /* Aligned reference: straight ldq loads. */
        uint64_t cur_l = ldq(pix2);
        uint64_t cur_r = ldq(pix2 + 8);

        do {
            uint64_t ref_l, ref_r, nxt_l, nxt_r;

            ref_l = ldq(pix1);
            ref_r = ldq(pix1 + 8);
            pix2 += line_size;
            nxt_l = ldq(pix2);
            nxt_r = ldq(pix2 + 8);

            sum += perr(ref_l, avg2(cur_l, nxt_l))
                 + perr(ref_r, avg2(cur_r, nxt_r));

            pix1 += line_size;
            cur_l = nxt_l;
            cur_r = nxt_r;
        } while (--h);
    }
    return sum;
}

/**
 * Sum of absolute differences between a 16xh block in pix1 and the
 * 2D (horizontal + vertical) half-pel interpolated reference at pix2:
 * each reference byte is avg4() of the 2x2 neighbourhood.  As in the
 * y2 variant, the previously loaded row (p2_l/p2_r plus the extra 17th
 * pixel p2_x) is carried between iterations so each source row is read
 * once.
 */
static int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    /* p2_x holds pixel 16 of the current row, pre-positioned in the top
       byte so it can be OR-ed directly into the byte-shifted view. */
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        /* Little-endian: the << 56 keeps only the lowest byte of the
           quadword, i.e. pix2[16]. */
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        /* avg4 of: current row, current row shifted one pixel left,
           next row, next row shifted one pixel left.  The >> 8 plus the
           OR-ed-in top byte implements the one-pixel shift. */
        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        /* Shift the pipeline: next row becomes current row. */
        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}

ff_me_cmp_init_alpha(MECmpContext * c,AVCodecContext * avctx)305 av_cold void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx)
306 {
307     /* amask clears all bits that correspond to present features.  */
308     if (amask(AMASK_MVI) == 0) {
309         c->sad[0]           = pix_abs16x16_mvi_asm;
310         c->sad[1]           = pix_abs8x8_mvi;
311         c->pix_abs[0][0]    = pix_abs16x16_mvi_asm;
312         c->pix_abs[1][0]    = pix_abs8x8_mvi;
313         c->pix_abs[0][1]    = pix_abs16x16_x2_mvi;
314         c->pix_abs[0][2]    = pix_abs16x16_y2_mvi;
315         c->pix_abs[0][3]    = pix_abs16x16_xy2_mvi;
316     }
317 }
318