• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;******************************************************************************
2;* x86-optimized functions for the CFHD encoder
3;* Copyright (c) 2021 Paul B Mahol
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
; Read-only constants for the two filters in this file.  Each local table is
; 16 bytes (8 words or 4 dwords), i.e. exactly one XMM register.
; The pw_* tables are pmaddwd multipliers: pmaddwd multiplies adjacent word
; pairs (a, b) by the matching constant pair and sums them into one dword.
24SECTION_RODATA
25
26pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1 ; a - b
27pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1 ; b - a (vert filter: pair interleaved in swapped order)
28pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11 ; 5*a - 11*b (vert filter, first output row)
29pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11 ; 11*b - 5*a (vert filter, last output row, swapped pair)
30pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5 ; 11*a - 5*b (vert filter, last output row)
31pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5 ; 5*b - 11*a (vert filter, first output row, swapped pair)
32pd_4:  times 4 dd  4 ; rounding bias added to the dword sums before the ">> 3"
33pw_n4: times 8 dw -4 ; -4*(a + b) (vert filter, last output row)
; The following constants are defined outside this file.
; NOTE(review): judging by their names they hold -1, +1 and +4 in every word;
; confirm against their definition.
34cextern pw_m1
35cextern pw_1
36cextern pw_4
37
38SECTION .text
39
;------------------------------------------------------------------------------
; cfhdenc_horiz_filter(input, low, high, istride, lwidth, hwidth, width, y)
;
; For each of `y` rows, reads 16-bit samples from `input` and writes one
; `low` and one `high` 16-bit sample per pair of input samples.
; With in[] = current input row and w = width (the formulas assume the external
; constants pw_1 / pw_m1 hold +1 / -1 in every word - TODO confirm):
;
;   low[i]  = sat16(in[2i] + in[2i+1])                           for every i
;   high[0] = sat16((5*in[0] - 11*in[1] + 4*in[2] + 4*in[3]
;                    - in[4] - in[5] + 4) >> 3)
;   high[i] = sat16(((-in[2i-2] - in[2i-1] + in[2i+2] + in[2i+3] + 4) >> 3)
;                    + in[2i] - in[2i+1])                        (vector loop)
;   high[w/2-1] = sat16((11*in[w-2] - 5*in[w-1] - 4*in[w-3] - 4*in[w-4]
;                    + in[w-5] + in[w-6] + 4) >> 3)
;
; Registers: m7 = pd_4, m8 = pw_1, m9 = pw_m1, m10 = pw_p1_n1;
;            x = byte offset into low/high, temp = scalar accumulator.
; cglobal operands (x86inc convention): 8 arguments, 10 GPRs, 11 XMM registers.
;
; NOTE(review): the vector loop always runs at least once and handles 8 output
; samples (16 input samples) per iteration without a remainder check, so it can
; read and write past `width`; callers presumably pad their buffers - confirm.
;------------------------------------------------------------------------------
40%if ARCH_X86_64
41INIT_XMM sse2
42cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
    ; The three strides are doubled so they can be added directly to the byte
    ; pointers (all samples are accessed as 16-bit words).
43    shl  istrideq, 1
44    shl   lwidthq, 1
45    shl   hwidthq, 1
46    mova       m7, [pd_4]
47    mova       m8, [pw_1]
48    mova       m9, [pw_m1]
49    mova       m10,[pw_p1_n1]
    ; Sign-extend the 32-bit arguments to 64 bits (x86inc helper macro).
50    movsxdifnidn yq, yd
51    movsxdifnidn widthq, widthd
52    neg        yq ; row counter runs from -y up to 0
53.looph:
    ; ---- first output pair of the row (scalar) ----
54    movsx          xq, word [inputq]
55
56    movsx       tempq, word [inputq + 2]
57    add         tempq, xq ; in[0] + in[1]
58
    ; movd / packssdw / movd: saturate the 32-bit value to a signed 16-bit word
59    movd          xm0, tempd
60    packssdw       m0, m0
61    movd        tempd, m0
62    mov   word [lowq], tempw ; low[0]
63
64    movsx          xq, word [inputq]
65    imul           xq, 5
66    movsx       tempq, word [inputq + 2]
67    imul        tempq, -11
68    add         tempq, xq ; 5*in[0] - 11*in[1]
69
70    movsx          xq, word [inputq + 4]
71    imul           xq, 4
72    add         tempq, xq ; + 4*in[2]
73
74    movsx          xq, word [inputq + 6]
75    imul           xq, 4
76    add         tempq, xq ; + 4*in[3]
77
78    movsx          xq, word [inputq + 8]
79    imul           xq, -1
80    add         tempq, xq ; - in[4]
81
82    movsx          xq, word [inputq + 10]
83    imul           xq, -1
84    add         tempq, xq ; - in[5]
85
86    add         tempq, 4 ; rounding bias
87    sar         tempq, 3
88
89    movd          xm0, tempd
90    packssdw       m0, m0
91    movd        tempd, m0
92    mov  word [highq], tempw ; high[0]
93
94    mov            xq, 2 ; byte offset of output sample 1; input is addressed at xq*2
95
    ; ---- vector loop: 8 output samples per iteration ----
96.loopw:
97    movu           m0, [inputq + xq * 2]
98    movu           m1, [inputq + xq * 2 + mmsize]
99
100    pmaddwd        m0, m8 ; in[2i] + in[2i+1] as dwords
101    pmaddwd        m1, m8
102
103    packssdw       m0, m1 ; saturate to 8 words
104    movu    [lowq+xq], m0
105
106    movu           m2, [inputq + xq * 2 - 4] ; previous input pair
107    movu           m3, [inputq + xq * 2 - 4 + mmsize]
108
109    pmaddwd        m2, m9 ; -(in[2i-2] + in[2i-1])
110    pmaddwd        m3, m9
111
112    movu           m0, [inputq + xq * 2 + 4] ; next input pair
113    movu           m1, [inputq + xq * 2 + 4 + mmsize]
114
115    pmaddwd        m0, m8 ; in[2i+2] + in[2i+3]
116    pmaddwd        m1, m8
117
118    paddd          m0, m2
119    paddd          m1, m3
120
121    paddd          m0, m7 ; + 4 (rounding)
122    paddd          m1, m7
123
124    psrad          m0, 3
125    psrad          m1, 3
126
127    movu           m5, [inputq + xq * 2 + 0] ; current input pair
128    movu           m6, [inputq + xq * 2 + mmsize]
129
130    pmaddwd        m5, m10 ; in[2i] - in[2i+1]
131    pmaddwd        m6, m10
132
133    paddd          m0, m5 ; added after the shift, i.e. not scaled by 1/8
134    paddd          m1, m6
135
136    packssdw       m0, m1
137    movu   [highq+xq], m0
138
139    add            xq, mmsize
140    cmp            xq, widthq ; width is compared against the output byte offset
141    jl .loopw
142
    ; ---- last output pair of the row (scalar) ----
    ; Move the pointers to the end of the row so the tail can use negative
    ; offsets; the stores below overwrite what the vector loop wrote there.
143    add          lowq, widthq
144    add         highq, widthq
145    lea        inputq, [inputq + widthq * 2]
146
147    movsx          xq, word [inputq - 4]
148    movsx       tempq, word [inputq - 2]
149    add         tempq, xq ; in[w-2] + in[w-1]
150
151    movd          xm0, tempd
152    packssdw       m0, m0
153    movd        tempd, m0
154    mov word [lowq-2], tempw ; low[w/2-1]
155
156    movsx       tempq, word [inputq - 4]
157    imul        tempq, 11
158    movsx          xq, word [inputq - 2]
159    imul           xq, -5
160    add         tempq, xq ; 11*in[w-2] - 5*in[w-1]
161
162    movsx          xq, word [inputq - 6]
163    imul           xq, -4
164    add         tempq, xq ; - 4*in[w-3]
165
166    movsx          xq, word [inputq - 8]
167    imul           xq, -4
168    add         tempq, xq ; - 4*in[w-4]
169
170    movsx          xq, word [inputq - 10]
171    add         tempq, xq ; + in[w-5]
172
173    movsx          xq, word [inputq - 12]
174    add         tempq, xq ; + in[w-6]
175
176    add         tempq, 4 ; rounding bias
177    sar         tempq, 3
178
179    movd          xm0, tempd
180    packssdw       m0, m0
181    movd        tempd, m0
182    mov word [highq-2], tempw ; high[w/2-1]
183
    ; Undo the end-of-row pointer adjustment, then step to the next row.
184    sub        inputq, widthq
185    sub        inputq, widthq
186    sub         highq, widthq
187    sub          lowq, widthq
188
189    add          lowq, lwidthq
190    add         highq, hwidthq
191    add        inputq, istrideq
192    add            yq, 1
193    jl .looph ; loop while the (negative) row counter has not reached 0
194
195    RET
196%endif
197
;------------------------------------------------------------------------------
; cfhdenc_vert_filter(input, low, high, istride, lwidth, hwidth, width, height)
;
; Column-wise counterpart of the horizontal filter: 8 adjacent columns
; (one XMM register of 16-bit samples) are processed per .loopw iteration, and
; each pair of input rows (y, y+1) produces one `low` and one `high` row.
; With r[k] = input row k and h = the original height argument (formulas assume
; the external constants pw_1 / pw_m1 / pw_4 hold +1 / -1 / +4 in every word -
; TODO confirm):
;
;   low  row y/2 = sat16(r[y] + r[y+1])                          (paddsw)
;   high row 0   = sat16((5*r[0] - 11*r[1] + 4*r[2] + 4*r[3]
;                         - r[4] - r[5] + 4) >> 3)
;   high row y/2 = sat16(((-r[y-2] - r[y-1] + r[y+2] + r[y+3] + 4) >> 3)
;                         + r[y] - r[y+1])          for y = 2, 4, ... < h-2
;   last high row = sat16((r[y-4] + r[y-3] - 4*r[y-2] - 4*r[y-1]
;                         + 11*r[y] - 5*r[y+1] + 4) >> 3)
;                   with y = value of the row counter when .looph exits
;
; Rows are combined by interleaving two rows with punpcklwd/punpckhwd and
; feeding the pairs to pmaddwd.  The high-half interleave is written as
; "punpckhwd second, first", so its pairs are in swapped order; that is why the
; asymmetric coefficient pairs use mirrored constants (pw_n11_p5, pw_n1_p1,
; pw_n5_p11) for the high half.
;
; Registers: m7 = pd_4, m8 = pw_1, m9 = pw_m1, m10 = pw_p1_n1, m11 = pw_n1_p1,
;            m12 = pw_4, m13 = pw_n4; x = column byte offset, y = input row,
;            pos = scratch byte offset.
; cglobal operands (x86inc convention): 8 arguments, 11 GPRs, 14 XMM registers.
;
; NOTE(review): both loops are do-while shaped, so .looph and .loopw each run
; at least once, and columns are handled in whole 16-byte groups with no
; remainder check; callers presumably guarantee suitable sizes/padding - confirm.
;------------------------------------------------------------------------------
198%if ARCH_X86_64
199INIT_XMM sse2
200cglobal cfhdenc_vert_filter, 8, 11, 14, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
201    shl  istrideq, 1 ; input stride is doubled before being used as a byte offset
202
    ; lwidth/hwidth are left unscaled: they are multiplied by y, which advances
    ; by 2 per output row, so lwidth*y equals (2*lwidth) * (y/2).
203    shl    widthd, 1 ; width is doubled for comparison with the byte offset x
204    sub   heightd, 2 ; the final row pair is handled separately after .looph
205
206    xor        xq, xq
207
208    mova       m7, [pd_4]
209    mova       m8, [pw_1]
210    mova       m9, [pw_m1]
211    mova       m10,[pw_p1_n1]
212    mova       m11,[pw_n1_p1]
213    mova       m12,[pw_4]
214    mova       m13,[pw_n4]
215.loopw:
216    mov        yq, 2 ; .looph starts at input row 2; rows 0/1 are handled right here
217
    ; ---- first output row: low ----
218    mov      posq, xq
219    movu       m0, [inputq + posq]
220    add      posq, istrideq
221    movu       m1, [inputq + posq]
222
223    paddsw     m0, m1 ; saturating r[0] + r[1]
224
225    movu    [lowq + xq], m0
226
    ; ---- first output row: high, from input rows 0..5 ----
227    mov      posq, xq
228
229    movu       m0, [inputq + posq]
230    add      posq, istrideq
231    movu       m1, [inputq + posq]
232    add      posq, istrideq
233    movu       m2, [inputq + posq]
234    add      posq, istrideq
235    movu       m3, [inputq + posq]
236    add      posq, istrideq
237    movu       m4, [inputq + posq]
238    add      posq, istrideq
239    movu       m5, [inputq + posq]
240
    ; Interleave row pairs: low halves as (first, second), high halves as
    ; (second, first).
241    mova       m6, m0
242    punpcklwd  m0, m1
243    punpckhwd  m1, m6
244
245    mova       m6, m2
246    punpcklwd  m2, m3
247    punpckhwd  m3, m6
248
249    mova       m6, m4
250    punpcklwd  m4, m5
251    punpckhwd  m5, m6
252
253    pmaddwd    m0, [pw_p5_n11] ; 5*r[0] - 11*r[1]
254    pmaddwd    m1, [pw_n11_p5] ; same value, swapped pair order
255    pmaddwd    m2, m12 ; pw_4 applied to r[2], r[3]
256    pmaddwd    m3, m12
257    pmaddwd    m4, m9 ; pw_m1 applied to r[4], r[5]
258    pmaddwd    m5, m9
259
260    paddd      m0, m2
261    paddd      m1, m3
262    paddd      m0, m4
263    paddd      m1, m5
264
265    paddd      m0, m7 ; + 4 (rounding)
266    paddd      m1, m7
267
268    psrad      m0, 3
269    psrad      m1, 3
270    packssdw   m0, m1 ; saturate back to 8 words
271
272    movu   [highq + xq], m0
273
    ; ---- middle rows ----
274.looph:
275
276    mov      posq, istrideq
277    imul     posq, yq
278    add      posq, xq ; byte offset of input row y at column x
279
280    movu       m0, [inputq + posq]
281
282    add      posq, istrideq
283    movu       m1, [inputq + posq]
284
285    paddsw     m0, m1 ; saturating r[y] + r[y+1]
286
287    mov      posq, lwidthq
288    imul     posq, yq
289    add      posq, xq
290
291    movu    [lowq + posq], m0
292
293    add        yq, -2 ; temporarily step back to row y-2 to load rows y-2..y+3
294
295    mov      posq, istrideq
296    imul     posq, yq
297    add      posq, xq
298
299    movu       m0, [inputq + posq]
300    add      posq, istrideq
301    movu       m1, [inputq + posq]
302    add      posq, istrideq
303    movu       m2, [inputq + posq]
304    add      posq, istrideq
305    movu       m3, [inputq + posq]
306    add      posq, istrideq
307    movu       m4, [inputq + posq]
308    add      posq, istrideq
309    movu       m5, [inputq + posq]
310
311    add        yq, 2 ; restore y
312
313    mova       m6, m0
314    punpcklwd  m0, m1
315    punpckhwd  m1, m6
316
317    mova       m6, m2
318    punpcklwd  m2, m3
319    punpckhwd  m3, m6
320
321    mova       m6, m4
322    punpcklwd  m4, m5
323    punpckhwd  m5, m6
324
325    pmaddwd    m0, m9 ; pw_m1 applied to r[y-2], r[y-1]
326    pmaddwd    m1, m9
327    pmaddwd    m2, m10 ; r[y] - r[y+1]
328    pmaddwd    m3, m11 ; same value, swapped pair order
329    pmaddwd    m4, m8 ; pw_1 applied to r[y+2], r[y+3]
330    pmaddwd    m5, m8
331
332    paddd      m0, m4
333    paddd      m1, m5
334
335    paddd      m0, m7 ; + 4 (rounding)
336    paddd      m1, m7
337
338    psrad      m0, 3
339    psrad      m1, 3
340    paddd      m0, m2 ; difference term is added after the shift
341    paddd      m1, m3
342    packssdw   m0, m1
343
344    mov      posq, hwidthq
345    imul     posq, yq
346    add      posq, xq
347
348    movu   [highq + posq], m0
349
350    add        yq, 2
351    cmp        yq, heightq ; heightq was reduced by 2 above
352    jl .looph
353
    ; ---- last output row: low ----
354    mov      posq, istrideq
355    imul     posq, yq
356    add      posq, xq
357
358    movu       m0, [inputq + posq]
359    add      posq, istrideq
360    movu       m1, [inputq + posq]
361
362    paddsw     m0, m1
363
364    mov      posq, lwidthq
365    imul     posq, yq
366    add      posq, xq
367
368    movu    [lowq + posq], m0
369
    ; ---- last output row: high, from input rows y-4..y+1 ----
370    sub        yq, 4
371
372    mov      posq, istrideq
373    imul     posq, yq
374    add      posq, xq
375
376    movu       m0, [inputq + posq]
377    add      posq, istrideq
378    movu       m1, [inputq + posq]
379    add      posq, istrideq
380    movu       m2, [inputq + posq]
381    add      posq, istrideq
382    movu       m3, [inputq + posq]
383    add      posq, istrideq
384    movu       m4, [inputq + posq]
385    add      posq, istrideq
386    movu       m5, [inputq + posq]
387
388    add        yq, 4 ; restore y
389
390    mova       m6, m0
391    punpcklwd  m0, m1
392    punpckhwd  m1, m6
393
394    mova       m6, m2
395    punpcklwd  m2, m3
396    punpckhwd  m3, m6
397
398    mova       m6, m4
399    punpcklwd  m4, m5
400    punpckhwd  m5, m6
401
402    pmaddwd    m0, m8 ; pw_1 applied to r[y-4], r[y-3]
403    pmaddwd    m1, m8
404    pmaddwd    m2, m13 ; -4*(r[y-2] + r[y-1])
405    pmaddwd    m3, m13
406    pmaddwd    m4, [pw_p11_n5] ; 11*r[y] - 5*r[y+1]
407    pmaddwd    m5, [pw_n5_p11] ; same value, swapped pair order
408
409    paddd      m4, m2
410    paddd      m5, m3
411
412    paddd      m4, m0
413    paddd      m5, m1
414
415    paddd      m4, m7 ; + 4 (rounding)
416    paddd      m5, m7
417
418    psrad      m4, 3
419    psrad      m5, 3
420    packssdw   m4, m5
421
422    mov      posq, hwidthq
423    imul     posq, yq
424    add      posq, xq
425
426    movu   [highq + posq], m4
427
428    add        xq, mmsize ; next group of 8 columns
429    cmp        xq, widthq
430    jl .loopw
431    RET
432%endif
433