; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define max_pixels_10 pw_1023

; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
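; each function adds the int16_t residual block to the destination pixels and
; clips the result to the valid pixel range, i.e. roughly:
;     for (y = 0; y < N; y++)
;         for (x = 0; x < N; x++)
;             dst[y * stride + x] = av_clip_pixel(dst[y * stride + x] + *res++);
; the 8-bit versions clip to [0, 255] via packuswb, the 10-bit versions clip
; to [0, 1023] with CLIPW against max_pixels_10

; adds two rows of 4 residuals to 8-bit pixels; expects m4 = 0 so punpcklbw
; widens the destination bytes to words before the saturated add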
%macro ADD_RES_MMX_4_8 0
    mova              m0, [r1]
    mova              m2, [r1+8]

    movd              m1, [r0]
    movd              m3, [r0+r2]
    punpcklbw         m1, m4
    punpcklbw         m3, m4

    paddsw            m0, m1
    paddsw            m2, m3
    packuswb          m0, m4
    packuswb          m2, m4

    movd            [r0], m0
    movd         [r0+r2], m2
%endmacro


INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
    pxor              m4, m4
    ADD_RES_MMX_4_8
    add               r1, 16
    lea               r0, [r0+r2*2]
    ADD_RES_MMX_4_8
    RET

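; adds four rows of 8 residuals to 8-bit pixels; expects m4 = 0 and r3 = 3*stride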
%macro ADD_RES_SSE_8_8 0
    movq              m0, [r0]
    movq              m1, [r0+r2]
    punpcklbw         m0, m4
    punpcklbw         m1, m4
    mova              m2, [r1]
    mova              m3, [r1+16]
    paddsw            m0, m2
    paddsw            m1, m3
    packuswb          m0, m1

    movq              m2, [r0+r2*2]
    movq              m3, [r0+r3]
    punpcklbw         m2, m4
    punpcklbw         m3, m4
    mova              m6, [r1+32]
    mova              m7, [r1+48]
    paddsw            m2, m6
    paddsw            m3, m7
    packuswb          m2, m3

    movq            [r0], m0
    movhps       [r0+r2], m0
    movq       [r0+r2*2], m2
    movhps       [r0+r3], m2
%endmacro

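; adds residuals to two sets of 16 8-bit pixels (xmm) or 32 8-bit pixels (ymm);
; %1 = byte offset into the residual buffer (r1), %2/%3 = destination addresses;
; expects m0 = 0.  with avx2 the upper 16 residuals of each row are loaded into
; the high ymm lane via vinserti128 to match the punpck lane layout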
%macro ADD_RES_SSE_16_32_8 3
    mova              m1, [%2]
    mova              m2, m1
    punpcklbw         m1, m0
    punpckhbw         m2, m0
    mova             xm5, [r1+%1]
    mova             xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128       m5, m5, [r1+%1+32], 1
    vinserti128       m6, m6, [r1+%1+48], 1
%endif
    paddsw            m1, m5
    paddsw            m2, m6

    mova              m3, [%3]
    mova              m4, m3
    punpcklbw         m3, m0
    punpckhbw         m4, m0
    mova             xm5, [r1+%1+mmsize*2]
    mova             xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128       m5, m5, [r1+%1+96], 1
    vinserti128       m6, m6, [r1+%1+112], 1
%endif
    paddsw            m3, m5
    paddsw            m4, m6

    packuswb          m1, m2
    packuswb          m3, m4
    mova            [%2], m1
    mova            [%3], m3
%endmacro


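; instantiates the 8-bit 8x8, 16x16 and 32x32 functions for the current
; instruction set (expanded below for sse2 and avx)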
%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
    pxor              m4, m4
    lea               r3, [r2*3]
    ADD_RES_SSE_8_8
    add               r1, 64
    lea               r0, [r0+r2*4]
    ADD_RES_SSE_8_8
    RET

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
    pxor                m0, m0
    lea                 r3, [r2*3]
    mov                r4d, 4
.loop:
    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    add                 r1, 128
    lea                 r0, [r0+r2*4]
    dec                r4d
    jg .loop
    RET

; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor                m0, m0
    mov                r4d, 16
.loop:
    ADD_RES_SSE_16_32_8  0, r0,    r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
    add                 r1, 128
    lea                 r0, [r0+r2*2]
    dec                r4d
    jg .loop
    RET
%endmacro

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor                 m0, m0
    lea                  r3, [r2*3]
    mov                 r4d, 8
.loop:
    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
    add                  r1, 256
    lea                  r0, [r0+r2*4]
    dec                 r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL

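; 10-bit variants: residuals are added to the 16-bit pixels with paddw and
; clipped with CLIPW against the min/max registers set up by each function
; (0 and max_pixels_10)

; adds four rows of 8 residuals; %1 = dst, %2 = stride, %3 = 3*stride, %4 = res;
; expects m4 = 0 and m5 = max_pixels_10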
%macro ADD_RES_SSE_8_10 4
    mova              m0, [%4]
    mova              m1, [%4+16]
    mova              m2, [%4+32]
    mova              m3, [%4+48]
    paddw             m0, [%1+0]
    paddw             m1, [%1+%2]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova          [%1+0], m0
    mova         [%1+%2], m1
    mova       [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

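; adds two rows of 4 residuals; %1 = dst, %2 = stride, %3 = res;
; expects m2 = 0 and m3 = max_pixels_10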
%macro ADD_RES_MMX_4_10 3
    mova              m0, [%1+0]
    mova              m1, [%1+%2]
    paddw             m0, [%3]
    paddw             m1, [%3+8]
    CLIPW             m0, m2, m3
    CLIPW             m1, m2, m3
    mova          [%1+0], m0
    mova         [%1+%2], m1
%endmacro

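; adds two rows of 16 residuals; %1 = dst, %2 = stride, %3 = res;
; expects m4 = 0 and m5 = max_pixels_10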
%macro ADD_RES_SSE_16_10 3
    mova              m0, [%3]
    mova              m1, [%3+16]
    mova              m2, [%3+32]
    mova              m3, [%3+48]
    paddw             m0, [%1]
    paddw             m1, [%1+16]
    paddw             m2, [%1+%2]
    paddw             m3, [%1+%2+16]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+16], m1
    mova         [%1+%2], m2
    mova      [%1+%2+16], m3
%endmacro

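; adds one row of 32 residuals; %1 = dst, %2 = res;
; expects m4 = 0 and m5 = max_pixels_10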
%macro ADD_RES_SSE_32_10 2
    mova              m0, [%2]
    mova              m1, [%2+16]
    mova              m2, [%2+32]
    mova              m3, [%2+48]

    paddw             m0, [%1]
    paddw             m1, [%1+16]
    paddw             m2, [%1+32]
    paddw             m3, [%1+48]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+16], m1
    mova         [%1+32], m2
    mova         [%1+48], m3
%endmacro

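; adds four rows of 16 residuals using ymm registers; %1 = dst, %2 = stride,
; %3 = 3*stride, %4 = res; expects m4 = 0 and m5 = max_pixels_10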
%macro ADD_RES_AVX2_16_10 4
    mova              m0, [%4]
    mova              m1, [%4+32]
    mova              m2, [%4+64]
    mova              m3, [%4+96]

    paddw             m0, [%1+0]
    paddw             m1, [%1+%2]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova          [%1+0], m0
    mova         [%1+%2], m1
    mova       [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

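; adds two rows of 32 residuals using ymm registers; %1 = dst, %2 = stride,
; %3 = res; expects m4 = 0 and m5 = max_pixels_10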
%macro ADD_RES_AVX2_32_10 3
    mova              m0, [%3]
    mova              m1, [%3+32]
    mova              m2, [%3+64]
    mova              m3, [%3+96]

    paddw             m0, [%1]
    paddw             m1, [%1+32]
    paddw             m2, [%1+%2]
    paddw             m3, [%1+%2+32]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+32], m1
    mova         [%1+%2], m2
    mova      [%1+%2+32], m3
%endmacro

; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual_4_10, 3, 3, 6
    pxor              m2, m2
    mova              m3, [max_pixels_10]
    ADD_RES_MMX_4_10  r0, r2, r1
    add               r1, 16
    lea               r0, [r0+2*r2]
    ADD_RES_MMX_4_10  r0, r2, r1
    RET

INIT_XMM sse2
cglobal hevc_add_residual_8_10, 3, 4, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]
    lea               r3, [r2*3]

    ADD_RES_SSE_8_10  r0, r2, r3, r1
    lea               r0, [r0+r2*4]
    add               r1, 64
    ADD_RES_SSE_8_10  r0, r2, r3, r1
    RET

cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    mov              r4d, 8
.loop:
    ADD_RES_SSE_16_10 r0, r2, r1
    lea               r0, [r0+r2*2]
    add               r1, 64
    dec              r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    mov              r4d, 32
.loop:
    ADD_RES_SSE_32_10 r0, r1
    lea               r0, [r0+r2]
    add               r1, 64
    dec              r4d
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor               m4, m4
    mova               m5, [max_pixels_10]
    lea                r3, [r2*3]

    mov               r4d, 4
.loop:
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
    lea                r0, [r0+r2*4]
    add                r1, 128
    dec               r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor               m4, m4
    mova               m5, [max_pixels_10]

    mov               r4d, 16
.loop:
    ADD_RES_AVX2_32_10 r0, r2, r1
    lea                r0, [r0+r2*2]
    add                r1, 128
    dec               r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL