1 /*
2 * Alpha optimized DSP utils
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #include "libavutil/attributes.h"
23 #include "libavcodec/hpeldsp.h"
24 #include "hpeldsp_alpha.h"
25 #include "asm.h"
26
/*
 * Truncating average of two words of packed unsigned bytes.
 *
 * Relies on a + b == 2 * (a & b) + (a ^ b), so each byte lane ends up as
 * (a + b) >> 1 without ever forming a 9-bit intermediate.
 * Assumes BYTE_VEC(0xfe) is 0xfe replicated into every byte lane; the macro
 * is not defined in this file (presumably asm.h).
 */
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    /* Bits present in both inputs survive the averaging unchanged. */
    const uint64_t shared = a & b;
    /* Bits present in only one input count for half. Bit 0 of each lane is
     * dropped before the shift so it cannot fall into the lane below. */
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return shared + half_diff;
}
31
/*
 * Rounding (round-half-up) average of two words of packed unsigned bytes.
 *
 * Relies on a + b == 2 * (a | b) - (a ^ b), so each byte lane ends up as
 * (a + b + 1) >> 1 without ever forming a 9-bit intermediate.
 * Assumes BYTE_VEC(0xfe) is 0xfe replicated into every byte lane; the macro
 * is not defined in this file (presumably asm.h).
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    /* Every bit present in either input, i.e. an upper bound on the mean. */
    const uint64_t either = a | b;
    /* Half of the bits present in only one input. Bit 0 of each lane is
     * dropped before the shift so it cannot fall into the lane below. */
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return either - half_diff;
}
36
#if 0
/*
 * Stand-alone form of the four-input byte average. The XY2 routines use this
 * same scheme but carry part of the work over from one iteration to the next.
 * Compiled out by the surrounding #if 0.
 *
 * Each byte is split into its upper six bits (pre-divided by four, so four of
 * them sum without overflowing a lane) and its lower two bits (summed with
 * the rounding constant, then divided by four).
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t lo_mask = BYTE_VEC(0x03);

    uint64_t high = ((l1 & ~lo_mask) >> 2)
                  + ((l2 & ~lo_mask) >> 2)
                  + ((l3 & ~lo_mask) >> 2)
                  + ((l4 & ~lo_mask) >> 2);
    uint64_t low  = (l1 & lo_mask)
                  + (l2 & lo_mask)
                  + (l3 & lo_mask)
                  + (l4 & lo_mask)
                  + BYTE_VEC(0x02);

    /* The mask discards the bits that the shift pulled in from the lane
     * above. */
    return high + ((low >> 2) & lo_mask);
}
#endif
54
/*
 * Full-pel row loop: fetch one 64-bit word per row through LOAD and hand it
 * to STORE, then step both pointers by line_size.
 *
 * Expands inside a function body that provides `pixels`, `block`,
 * `line_size` and `h`. The body runs once before `h` is tested, so the row
 * count is expected to be at least 1.
 */
#define OP(LOAD, STORE)                                                 \
    do {                                                                \
        const uint64_t row = LOAD(pixels);                              \
                                                                        \
        STORE(row, block);                                              \
        block  += line_size;                                            \
        pixels += line_size;                                            \
    } while (--h)
61
/*
 * Horizontal half-pel row loop: average every pixel with its right-hand
 * neighbour.
 *
 * The neighbour word is built by shifting the loaded word down one byte and
 * inserting pixels[8] as the top byte, so each row reads nine source bytes.
 * This presumes LOAD puts pixels[0] in the least significant byte.
 *
 * Expands inside a function body that provides `pixels`, `block`,
 * `line_size` and `h`; the row count is expected to be at least 1. AVG2 is
 * whichever averaging primitive is defined at the point of expansion.
 */
#define OP_X2(LOAD, STORE)                                              \
    do {                                                                \
        const uint64_t cur   = LOAD(pixels);                            \
        const uint64_t right = cur >> 8 | ((uint64_t) pixels[8] << 56); \
                                                                        \
        STORE(AVG2(cur, right), block);                                 \
        block  += line_size;                                            \
        pixels += line_size;                                            \
    } while (--h)
72
/*
 * Vertical half-pel row loop: average every pixel with the one directly
 * below it.
 *
 * The previously loaded row is carried over, so each source row is loaded
 * only once; in total h + 1 source rows are read to produce h output rows.
 *
 * Expands inside a function body that provides `pixels`, `block`,
 * `line_size` and `h`; the row count is expected to be at least 1. AVG2 is
 * whichever averaging primitive is defined at the point of expansion.
 */
#define OP_Y2(LOAD, STORE)                                              \
    do {                                                                \
        uint64_t above = LOAD(pixels);                                  \
                                                                        \
        do {                                                            \
            uint64_t below;                                             \
                                                                        \
            pixels += line_size;                                        \
            below   = LOAD(pixels);                                     \
            STORE(AVG2(above, below), block);                           \
            above   = below;                                            \
            block  += line_size;                                        \
        } while (--h);                                                  \
    } while (0)
86
/*
 * Diagonal half-pel row loop: average each pixel with its right, lower and
 * lower-right neighbours.
 *
 * To keep four-way sums inside a byte lane, every byte is split in two:
 *   - the upper six bits, shifted down by two before summing, and
 *   - the lower two bits, summed as-is, biased by AVG4_ROUNDER and then
 *     shifted down by two (the mask drops bits pulled in from the lane above).
 * The horizontal pair sums of a row are kept for the next iteration, so each
 * source row is loaded and split only once; h + 1 source rows of nine bytes
 * are read to produce h output rows.
 *
 * Expands inside a function body that provides `pixels`, `block`,
 * `line_size` and `h`; the row count is expected to be at least 1.
 * AVG4_ROUNDER is whichever rounding constant is defined at the point of
 * expansion.
 */
#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        const uint64_t lo_mask = BYTE_VEC(0x03);                            \
        uint64_t cur, right, sum_lo, sum_hi;                                \
                                                                            \
        cur    = LOAD(pixels);                                              \
        right  = cur >> 8 | ((uint64_t) pixels[8] << 56);                   \
        sum_lo = (cur & lo_mask) + (right & lo_mask);                       \
        sum_hi = ((cur & ~lo_mask) >> 2) + ((right & ~lo_mask) >> 2);       \
                                                                            \
        do {                                                                \
            uint64_t row_lo, row_hi, blend;                                 \
                                                                            \
            pixels += line_size;                                            \
            cur     = LOAD(pixels);                                         \
            right   = cur >> 8 | ((uint64_t) pixels[8] << 56);              \
            row_lo  = (cur & lo_mask) + (right & lo_mask);                  \
            row_hi  = ((cur & ~lo_mask) >> 2) + ((right & ~lo_mask) >> 2);  \
                                                                            \
            blend   = (((sum_lo + row_lo + AVG4_ROUNDER) >> 2) & lo_mask)   \
                    + sum_hi + row_hi;                                      \
            STORE(blend, block);                                            \
                                                                            \
            sum_lo  = row_lo;                                               \
            sum_hi  = row_hi;                                               \
            block  += line_size;                                            \
        } while (--h);                                                      \
    } while (0)
117
/*
 * Define the pair of functions for one operation/interpolation combination:
 *
 *   OPNAME_pixels<SUFF>_axp    processes one 64-bit word (8 bytes) per row;
 *   OPNAME_pixels16<SUFF>_axp  processes 16 bytes per row by running the
 *                              8-byte function on the left and right halves.
 *
 * OPKIND is one of the OP* row-loop macros and STORE the store primitive it
 * should use. The source pointer's low three bits select the loader: uldq
 * when they are non-zero, ldq when the pointer is 8-byte aligned.
 *
 * The alignment test converts the pointer with uintptr_t, the integer type
 * that C defines for holding a pointer value, rather than size_t.
 */
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    if ((uintptr_t) pixels & 0x7) {                                         \
        OPKIND(uldq, STORE);                                                \
    } else {                                                                \
        OPKIND(ldq, STORE);                                                 \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}
137
/*
 * Instantiate MAKE_OP for all four interpolation modes of one operation:
 * full-pel (empty suffix), _x2, _y2 and _xy2. This yields eight functions
 * per OPNAME, an 8-byte and a 16-byte variant of each mode.
 *
 * STORE is forwarded by name and only expanded inside the row-loop macros,
 * so the definition of STORE (and of AVG2 / AVG4_ROUNDER) in effect where
 * PIXOP is invoked decides the generated behaviour.
 */
#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME,     , OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
143
144 /* Rounding primitives. */
145 #define AVG2 avg2
146 #define AVG4 avg4
147 #define AVG4_ROUNDER BYTE_VEC(0x02)
148 #define STORE(l, b) stq(l, b)
149 PIXOP(put, STORE);
150
151 #undef STORE
152 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
153 PIXOP(avg, STORE);
154
155 /* Not rounding primitives. */
156 #undef AVG2
157 #undef AVG4
158 #undef AVG4_ROUNDER
159 #undef STORE
160 #define AVG2 avg2_no_rnd
161 #define AVG4 avg4_no_rnd
162 #define AVG4_ROUNDER BYTE_VEC(0x01)
163 #define STORE(l, b) stq(l, b)
164 PIXOP(put_no_rnd, STORE);
165
166 #undef STORE
167 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
168 PIXOP(avg_no_rnd, STORE);
169
/*
 * 16-byte-wide full-pel copy built from the 8-byte-wide routine
 * put_pixels_axp_asm (declared outside this file): the left half is copied
 * first, then the right half, each over all h rows.
 */
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    for (int x = 0; x < 16; x += 8) {
        put_pixels_axp_asm(block + x, pixels + x, line_size, h);
    }
}
176
ff_hpeldsp_init_alpha(HpelDSPContext * c,int flags)177 av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
178 {
179 c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
180 c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
181 c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
182 c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
183
184 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
185 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
186 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
187 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
188
189 c->avg_pixels_tab[0][0] = avg_pixels16_axp;
190 c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
191 c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
192 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
193
194 c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
195 c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
196 c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
197 c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
198
199 c->put_pixels_tab[1][0] = put_pixels_axp_asm;
200 c->put_pixels_tab[1][1] = put_pixels_x2_axp;
201 c->put_pixels_tab[1][2] = put_pixels_y2_axp;
202 c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
203
204 c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
205 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
206 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
207 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
208
209 c->avg_pixels_tab[1][0] = avg_pixels_axp;
210 c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
211 c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
212 c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
213 }
214