; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; dav1d_obmc_masks[] with 64-x interleaved
obmc_masks: db  0,  0,  0,  0
            ; 2 @4
            db 45, 19, 64,  0
            ; 4 @8
            db 39, 25, 50, 14, 59,  5, 64,  0
            ; 8 @16
            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
            ; 16 @32
            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
            ; 32 @64
            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
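; Each interleaved byte pair (m, 64-m) above sums to 64, so a single
; pmaddubsw against interleaved (a, b) source bytes yields the whole
; OBMC blend numerator per pixel; in C terms the reference blend is
; roughly: out = (a*m + b*(64 - m) + 32) >> 6.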

warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
warp_8x8_shufB: db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
warp_8x8_shufC: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
warp_8x8_shufD: db 6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_s_shuf2: db 0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
bilin_h_shuf4:  db 0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
bilin_h_shuf8:  db 0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
rescale_mul:    dd 0,  1,  2,  3
resize_shuf:    db 0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7

wm_420_sign:    times 4 dw 258
                times 4 dw 257
wm_422_sign:    times 8 db 128
                times 8 db 127

pb_8x0_8x8: times 8 db 0
            times 8 db 8
bdct_lb_dw: times 4 db 0
            times 4 db 4
            times 4 db 8
            times 4 db 12

pb_64:    times 16 db 64
pw_m256:  times 8 dw -256
pw_1:     times 8 dw 1
pw_2:     times 8 dw 2
pw_8:     times 8 dw 8
pw_15:    times 8 dw 15
pw_26:    times 8 dw 26
pw_34:    times 8 dw 34
pw_512:   times 8 dw 512
pw_1024:  times 8 dw 1024
pw_2048:  times 8 dw 2048
pw_6903:  times 8 dw 6903
pw_8192:  times 8 dw 8192
pd_32:    times 4 dd 32
pd_63:    times 4 dd 63
pd_512:   times 4 dd 512
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
pd_262144: times 4 dd 262144
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000: times 4 dd 0x4000
pq_0x40000000: times 2 dq 0x40000000

const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
    ; [-1, 0)
    db 0, 127,   0, 0,   0,   1, 0, 0, 0, 127,   0, 0,  -1,   2, 0, 0
    db 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1, 0
    db 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1, 0
    db 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1, 0
    db 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1, 0
    db 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2, 0
    db 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2, 0
    db 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2, 0
    db 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3, 0
    db 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3, 0
    db 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3, 0
    db 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4, 0
    db 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4, 0
    db 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4, 0
    db 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4, 0
    db 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4, 0
    db 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4, 0
    db 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4, 0
    db 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4, 0
    db 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4, 0
    db 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4, 0
    db 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4, 0
    db 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4, 0
    db 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3, 0
    db 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3, 0
    db 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3, 0
    db 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2, 0
    db 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2, 0
    db 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2, 0
    db 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1, 0
    db 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1, 0
    db 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0, 0
    ; [0, 1)
    db  0,   0,   1, 0, 0, 127,   0,  0,  0,  -1,   2, 0, 0, 127,   0,  0
    db  0,  -3,   4, 1, 1, 127,  -2,  0,  0,  -5,   6, 1, 1, 127,  -2,  0
    db  0,  -6,   8, 1, 2, 126,  -3,  0, -1,  -7,  11, 2, 2, 126,  -4, -1
    db -1,  -8,  13, 2, 3, 125,  -5, -1, -1, -10,  16, 3, 3, 124,  -6, -1
    db -1, -11,  18, 3, 4, 123,  -7, -1, -1, -12,  20, 3, 4, 122,  -7, -1
    db -1, -13,  23, 3, 4, 121,  -8, -1, -2, -14,  25, 4, 5, 120,  -9, -1
    db -1, -15,  27, 4, 5, 119, -10, -1, -1, -16,  30, 4, 5, 118, -11, -1
    db -2, -17,  33, 5, 6, 116, -12, -1, -2, -17,  35, 5, 6, 114, -12, -1
    db -2, -18,  38, 5, 6, 113, -13, -1, -2, -19,  41, 6, 7, 111, -14, -2
    db -2, -19,  43, 6, 7, 110, -15, -2, -2, -20,  46, 6, 7, 108, -15, -2
    db -2, -20,  49, 6, 7, 106, -16, -2, -2, -21,  51, 7, 7, 104, -16, -2
    db -2, -21,  54, 7, 7, 102, -17, -2, -2, -21,  56, 7, 8, 100, -18, -2
    db -2, -22,  59, 7, 8,  98, -18, -2, -2, -22,  62, 7, 8,  96, -19, -2
    db -2, -22,  64, 7, 8,  94, -19, -2, -2, -22,  67, 8, 8,  91, -20, -2
    db -2, -22,  69, 8, 8,  89, -20, -2, -2, -22,  72, 8, 8,  87, -21, -2
    db -2, -21,  74, 8, 8,  84, -21, -2, -2, -22,  77, 8, 8,  82, -21, -2
    db -2, -21,  79, 8, 8,  79, -21, -2, -2, -21,  82, 8, 8,  77, -22, -2
    db -2, -21,  84, 8, 8,  74, -21, -2, -2, -21,  87, 8, 8,  72, -22, -2
    db -2, -20,  89, 8, 8,  69, -22, -2, -2, -20,  91, 8, 8,  67, -22, -2
    db -2, -19,  94, 8, 7,  64, -22, -2, -2, -19,  96, 8, 7,  62, -22, -2
    db -2, -18,  98, 8, 7,  59, -22, -2, -2, -18, 100, 8, 7,  56, -21, -2
    db -2, -17, 102, 7, 7,  54, -21, -2, -2, -16, 104, 7, 7,  51, -21, -2
    db -2, -16, 106, 7, 6,  49, -20, -2, -2, -15, 108, 7, 6,  46, -20, -2
    db -2, -15, 110, 7, 6,  43, -19, -2, -2, -14, 111, 7, 6,  41, -19, -2
    db -1, -13, 113, 6, 5,  38, -18, -2, -1, -12, 114, 6, 5,  35, -17, -2
    db -1, -12, 116, 6, 5,  33, -17, -2, -1, -11, 118, 5, 4,  30, -16, -1
    db -1, -10, 119, 5, 4,  27, -15, -1, -1,  -9, 120, 5, 4,  25, -14, -2
    db -1,  -8, 121, 4, 3,  23, -13, -1, -1,  -7, 122, 4, 3,  20, -12, -1
    db -1,  -7, 123, 4, 3,  18, -11, -1, -1,  -6, 124, 3, 3,  16, -10, -1
    db -1,  -5, 125, 3, 2,  13,  -8, -1, -1,  -4, 126, 2, 2,  11,  -7, -1
    db  0,  -3, 126, 2, 1,   8,  -6,  0,  0,  -2, 127, 1, 1,   6,  -5,  0
    db  0,  -2, 127, 1, 1,   4,  -3,  0,  0,   0, 127, 0, 0,   2,  -1,  0
    ; [1, 2)
    db 0, 0, 127,   0, 0,   1,   0, 0, 0, 0, 127,   0, 0,  -1,   2, 0
    db 0, 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1
    db 0, 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1
    db 0, 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1
    db 0, 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1
    db 0, 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2
    db 0, 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2
    db 0, 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2
    db 0, 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3
    db 0, 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3
    db 0, 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3
    db 0, 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4
    db 0, 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4
    db 0, 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4
    db 0, 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4
    db 0, 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4
    db 0, 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4
    db 0, 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4
    db 0, 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4
    db 0, 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4
    db 0, 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4
    db 0, 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4
    db 0, 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4
    db 0, 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3
    db 0, 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3
    db 0, 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3
    db 0, 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2
    db 0, 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2
    db 0, 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2
    db 0, 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1
    db 0, 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1
    db 0, 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0
    db 0, 0,   2,  -1, 0,   0, 127, 0

pw_258:  times 2 dw 258

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
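; The -8 bias appears to compensate for mx/my filter indices being
; 1-based (index 0 is the integer position, which has no table row),
; so subpel_filters + idx*8 selects the intended 8-byte row.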

%macro BIDIR_JMP_TABLE 2-*
    ; %3 is the smallest width here; biasing the base by -2*%3 makes
    ; tzcnt(w)*4 index entry 0 for that width
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table: ; macro-local table label
    %rep %0 - 2 ; one dd entry per width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
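
; A table defined this way is typically consumed along these lines
; (sketch, not verbatim from any one function):
;   tzcnt  wd, wm                     ; wq = log2(w)
;   movsxd wq, [%1_%2_table+wq*4]     ; base-relative dd offset
;   add    wq, base                   ; base = %1_%2_table address
;   jmp    wq
; The -2*%3 bias makes tzcnt(w)*4 land on entry 0 at the smallest
; width (this holds for a first width of 2 or 4, which is all that
; is used below).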

BIDIR_JMP_TABLE avg, ssse3,        4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, ssse3,      4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, ssse3,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, ssse3,      4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep)
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro
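
; %4 is a bitmask selecting which tables to emit: bit 0 = .h_w*,
; bit 1 = .v_w*, bit 2 = .hv_w* (e.g. 7 emits all three, 1 only the
; horizontal table). Entries are dw offsets relative to %%base
; (%1_%3, e.g. put_ssse3), which the consumer adds back before
; jumping.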

HV_JMP_TABLE prep,  8tap,  sse2, 1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin,  sse2, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,   8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,   4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
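; e.g. table_offset(put, _bilin_h) expands to
; put_bilin_h_ssse3_table - put_ssse3 (with SUFFIX == _ssse3), so
; [t0+wq*2+table_offset(put, _bilin_h)] reads the dw entry for width
; 1<<wq when t0 holds put_ssse3.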

SECTION .text

INIT_XMM ssse3

%if ARCH_X86_32
 DECLARE_REG_TMP 1
 %define base t0-put_ssse3
%else
 DECLARE_REG_TMP 7
 %define base 0
%endif

%macro RESTORE_DSQ_32 1
 %if ARCH_X86_32
   mov                  %1, dsm ; restore dsq
 %endif
%endmacro

cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn          mxyd, r6m ; mx
    LEA                  t0, put_ssse3
    movifnidn          srcq, srcmp
    movifnidn           ssq, ssmp
    tzcnt                wd, wm
    mov                  hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    movzx                wd, word [t0+wq*2+table_offset(put,)]
    add                  wq, t0
    RESTORE_DSQ_32       t0
    jmp                  wq
.put_w2:
    movzx               r4d, word [srcq+ssq*0]
    movzx               r6d, word [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4w
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    mov                 r4d, [srcq+ssq*0]
    mov                 r6d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
.put_w16:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
.put_w32:
    movu                 m0, [srcq+ssq*0+16*0]
    movu                 m1, [srcq+ssq*0+16*1]
    movu                 m2, [srcq+ssq*1+16*0]
    movu                 m3, [srcq+ssq*1+16*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+16*0], m0
    mova  [dstq+dsq*0+16*1], m1
    mova  [dstq+dsq*1+16*0], m2
    mova  [dstq+dsq*1+16*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w32
    RET
.put_w64:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, ssq
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
.put_w128:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .put_w128
    RET
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
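    ; e.g. at the half-pel position (mx == 8) this reduces to
    ; (8*src[x] + 8*src[x+1] + 8) >> 4 = (src[x] + src[x+1] + 1) >> 1,
    ; i.e. a round-half-up average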
    imul               mxyd, 0x00ff00ff
    mova                 m4, [base+bilin_h_shuf8]
    mova                 m0, [base+bilin_h_shuf4]
    add                mxyd, 0x00100010
    movd                 m5, mxyd
    mov                mxyd, r7m ; my
    pshufd               m5, m5, q0000
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
    mova                 m3, [base+pw_2048]
    add                  wq, t0
    movifnidn           dsq, dsmp
    jmp                  wq
.h_w2:
    pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
.h_w2_loop:
    movd                 m0, [srcq+ssq*0]
    movd                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpckldq            m0, m1
    pshufb               m0, m4
    pmaddubsw            m0, m5
    pmulhrsw             m0, m3
    packuswb             m0, m0
    movd                r6d, m0
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movq                 m4, [srcq+ssq*0]
    movhps               m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m4, m0
    pmaddubsw            m4, m5
    pmulhrsw             m4, m3
    packuswb             m4, m4
    movd       [dstq+dsq*0], m4
    psrlq                m4, 32
    movd       [dstq+dsq*1], m4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    add                srcq, ssq
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .h_w16
    RET
.h_w32:
    movu                 m0, [srcq+mmsize*0+8*0]
    movu                 m1, [srcq+mmsize*0+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    movu                 m1, [srcq+mmsize*1+8*0]
    movu                 m2, [srcq+mmsize*1+8*1]
    add                srcq, ssq
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    packuswb             m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, dsq
    dec                  hd
    jg .h_w32
    RET
.h_w64:
    mov                  r6, -16*3
.h_w64_loop:
    movu                 m0, [srcq+r6+16*3+8*0]
    movu                 m1, [srcq+r6+16*3+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova     [dstq+r6+16*3], m0
    add                  r6, 16
    jle .h_w64_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    mov                  r6, -16*7
.h_w128_loop:
    movu                 m0, [srcq+r6+16*7+8*0]
    movu                 m1, [srcq+r6+16*7+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova     [dstq+r6+16*7], m0
    add                  r6, 16
    jle .h_w128_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w128
    RET
.v:
    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
    imul               mxyd, 0x00ff00ff
    mova                 m5, [base+pw_2048]
    add                mxyd, 0x00100010
    add                  wq, t0
    movd                 m4, mxyd
    pshufd               m4, m4, q0000
    movifnidn           dsq, dsmp
    jmp                  wq
.v_w2:
    movd                 m0, [srcq+ssq*0]
.v_w2_loop:
    pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
    lea                srcq, [srcq+ssq*2]
    pshuflw              m1, m0, q2301
    pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
    punpcklbw            m1, m0
    pmaddubsw            m1, m4
    pmulhrsw             m1, m5
    packuswb             m1, m1
    movd                r6d, m1
    mov        [dstq+dsq*1], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*0], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd                 m0, [srcq+ssq*0]
.v_w4_loop:
    movd                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m1, m0
    movd                 m0, [srcq+ssq*0]
    punpckldq            m1, m2 ; 0 1
    punpckldq            m2, m0 ; 1 2
    punpcklbw            m1, m2
    pmaddubsw            m1, m4
    pmulhrsw             m1, m5
    packuswb             m1, m1
    movd       [dstq+dsq*0], m1
    psrlq                m1, 32
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq                 m0, [srcq+ssq*0]
.v_w8_loop:
    movq                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m1, m0
    movq                 m0, [srcq+ssq*0]
    punpcklbw            m1, m2
    punpcklbw            m2, m0
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
%macro PUT_BILIN_V_W16 0
    movu                 m0, [srcq+ssq*0]
%%loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m1, m0
    mova                 m2, m0
    movu                 m0, [srcq+ssq*0]
    punpcklbw            m1, m3
    punpckhbw            m2, m3
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    pmaddubsw            m2, m4
    pmaddubsw            m3, m4
    pmulhrsw             m2, m5
    pmulhrsw             m3, m5
    packuswb             m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg %%loop
%endmacro
.v_w16:
    PUT_BILIN_V_W16
    RET
.v_w128:
    lea                 r6d, [hq+(7<<16)]
    jmp .v_w16gt
.v_w64:
    lea                 r6d, [hq+(3<<16)]
    jmp .v_w16gt
.v_w32:
    lea                 r6d, [hq+(1<<16)]
.v_w16gt:
    mov                  r4, srcq
%if ARCH_X86_64
    mov                  r7, dstq
%endif
.v_w16gt_loop:
    PUT_BILIN_V_W16
%if ARCH_X86_64
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
%else
    mov                dstq, dstmp
    add                  r4, 16
    movzx                hd, r6w
    add                dstq, 16
    mov                srcq, r4
    mov               dstmp, dstq
%endif
    sub                 r6d, 1<<16
    jg .v_w16gt
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
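    ; implementation sketch: the doubled h coefficients (paddb m5, m5
    ; below) make each 16-scaled row arrive as 2*H, so pavgw with
    ; pw_15 gives (2*H + 16) >> 1 = H + 8, and pmulhw by my << 11
    ; computes (2*d * my * 2048) >> 16 = (d * my) >> 4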
    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM       8
    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
    mova                 m7, [base+pw_15]
    movd                 m6, mxyd
    add                  wq, t0
    pshuflw              m6, m6, q0000
    paddb                m5, m5
    punpcklqdq           m6, m6
    jmp                  wq
.hv_w2:
    RESTORE_DSQ_32       t0
    movd                 m0, [srcq+ssq*0]
    punpckldq            m0, m0
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w2_loop:
    movd                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    punpckldq            m1, m2
    pshufb               m1, m4
    pmaddubsw            m1, m5             ; 1 _ 2 _
    shufps               m2, m0, m1, q1032  ; 0 _ 1 _
    mova                 m0, m1
    psubw                m1, m2   ; 2 * (src[x + src_stride] - src[x])
    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])) >> 4
    pavgw                m2, m7   ; src[x] + 8
    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
    psrlw                m1, 4
    packuswb             m1, m1
%if ARCH_X86_64
    movq                 r6, m1
%else
    pshuflw              m1, m1, q2020
    movd                r6d, m1
%endif
    mov        [dstq+dsq*0], r6w
    shr                  r6, gprsize*4
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova                 m4, [base+bilin_h_shuf4]
    movddup              m0, [srcq+ssq*0]
    movifnidn           dsq, dsmp
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w4_loop:
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*0]
    pshufb               m1, m4
    pmaddubsw            m1, m5            ; 1 2
    shufps               m2, m0, m1, q1032 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhw               m1, m6
    pavgw                m2, m7
    paddw                m1, m2
    psrlw                m1, 4
    packuswb             m1, m1
    movd       [dstq+dsq*0], m1
    psrlq                m1, 32
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    movu                 m0, [srcq+ssq*0]
    movifnidn           dsq, dsmp
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w8_loop:
    movu                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m2, m4
    pmaddubsw            m2, m5
    psubw                m1, m2, m0
    pmulhw               m1, m6
    pavgw                m0, m7
    paddw                m1, m0
    movu                 m0, [srcq+ssq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
    psubw                m3, m0, m2
    pmulhw               m3, m6
    pavgw                m2, m7
    paddw                m3, m2
    psrlw                m1, 4
    psrlw                m3, 4
    packuswb             m1, m3
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea                 r6d, [hq+(7<<16)]
    jmp .hv_w16_start
.hv_w64:
    lea                 r6d, [hq+(3<<16)]
    jmp .hv_w16_start
.hv_w32:
    lea                 r6d, [hq+(1<<16)]
.hv_w16_start:
    mov                  r4, srcq
%if ARCH_X86_32
    %define m8 [dstq]
%else
    mov                  r7, dstq
%endif
.hv_w16:
    movifnidn           dsq, dsmp
%if WIN64
    movaps              r4m, m8
%endif
.hv_w16_loop0:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w16_loop:
    add                srcq, ssq
    movu                 m2, [srcq+8*0]
    movu                 m3, [srcq+8*1]
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova                 m8, m2
    psubw                m2, m0
    pmulhw               m2, m6
    pavgw                m0, m7
    paddw                m2, m0
    mova                 m0, m3
    psubw                m3, m1
    pmulhw               m3, m6
    pavgw                m1, m7
    paddw                m3, m1
    mova                 m1, m0
    mova                 m0, m8
    psrlw                m2, 4
    psrlw                m3, 4
    packuswb             m2, m3
    mova             [dstq], m2
    add                dstq, dsmp
    dec                  hd
    jg .hv_w16_loop
%if ARCH_X86_32
    mov                dstq, dstm
    add                  r4, 16
    movzx                hd, r6w
    add                dstq, 16
    mov                srcq, r4
    mov                dstm, dstq
%else
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
%endif
    sub                 r6d, 1<<16
    jg .hv_w16_loop0
%if WIN64
    movaps               m8, r4m
%endif
    RET

%macro PSHUFB_BILIN_H8 2 ; dst, src
 %if cpuflag(ssse3)
    pshufb               %1, %2
 %else
    psrldq               %2, %1, 1
    punpcklbw            %1, %2
 %endif
%endmacro

%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
 %if cpuflag(ssse3)
    pshufb               %1, %2
 %else
    psrldq               %2, %1, 1
    punpckhbw            %3, %1, %2
    punpcklbw            %1, %2
    punpcklqdq           %1, %3
 %endif
%endmacro

%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
 %if cpuflag(ssse3)
    pmaddubsw            %1, %2
 %else
  %if %5 == 1
    pxor                 %3, %3
  %endif
    punpckhbw            %4, %1, %3
    punpcklbw            %1, %1, %3
    pmaddwd              %4, %2
    pmaddwd              %1, %2
    packssdw             %1, %4
 %endif
%endmacro
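
; In the SSE2 path the unsigned bytes are zero-extended to words
; against %3 and fed to two pmaddwd, so %2 must hold word coefficient
; pairs; that appears to be why the non-ssse3 callers build the
; weights with `imul mxyd, 0xffff` + `add mxyd, 16` ((16-mxy, mxy) as
; words) instead of the packed-byte 0x00ff00ff form.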

%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
 %if cpuflag(ssse3)
    pmulhrsw             %1, %2
 %else
    punpckhwd            %3, %1, %4
    punpcklwd            %1, %4
    pmaddwd              %3, %2
    pmaddwd              %1, %2
    psrad                %3, %5
    psrad                %1, %5
    packssdw             %1, %3
 %endif
%endmacro
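
; The SSE2 fallback interleaves each word of %1 with the rounding
; words in %4, so pmaddwd against %2 (expected to hold (coef, 1) word
; pairs, e.g. from `or mxyd, 1<<16` in the .hv path below) produces
; x*coef + rnd per dword; psrad by %5 and packssdw then complete
; (x*coef + rnd) >> shift.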

%macro PREP_BILIN 0
%if ARCH_X86_32
    %define base r6-prep%+SUFFIX
%else
    %define base 0
%endif

cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    LEA                  r6, prep%+SUFFIX
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
%if notcpuflag(ssse3)
    add                  r6, prep_ssse3 - prep_sse2
    jmp prep_ssse3
%else
    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    pxor                 m4, m4
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    movd                 m0, [srcq+strideq*0]
    movd                 m1, [srcq+strideq*1]
    movd                 m2, [srcq+strideq*2]
    movd                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpckldq            m0, m1
    punpckldq            m2, m3
    punpcklbw            m0, m4
    punpcklbw            m2, m4
    psllw                m0, 4
    psllw                m2, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movq                 m0, [srcq+strideq*0]
    movq                 m1, [srcq+strideq*1]
    movq                 m2, [srcq+strideq*2]
    movq                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m0, m4
    punpcklbw            m1, m4
    punpcklbw            m2, m4
    punpcklbw            m3, m4
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu                 m1, [srcq+strideq*0]
    movu                 m3, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 2
    jg .prep_w16
    RET
.prep_w128:
    mov                  r3, -128
    jmp .prep_w32_start
.prep_w64:
    mov                  r3, -64
    jmp .prep_w32_start
.prep_w32:
    mov                  r3, -32
.prep_w32_start:
    sub                srcq, r3
.prep_w32_vloop:
    mov                  r6, r3
.prep_w32_hloop:
    movu                 m1, [srcq+r6+16*0]
    movu                 m3, [srcq+r6+16*1]
    punpcklbw            m0, m1, m4
    punpckhbw            m1, m4
    punpcklbw            m2, m3, m4
    punpckhbw            m3, m4
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    add                  r6, 32
    jl .prep_w32_hloop
    add                srcq, strideq
    dec                  hd
    jg .prep_w32_vloop
    RET
%endif
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
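    ; unlike put, prep stores the unrounded 16-scaled (4-bit
    ; fractional) result as int16, matching the psllw by 4 in the
    ; plain .prep path; rounding is deferred to the later
    ; averaging/blend stage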
%if cpuflag(ssse3)
    imul               mxyd, 0x00ff00ff
    mova                 m4, [base+bilin_h_shuf8]
    add                mxyd, 0x00100010
%else
    imul               mxyd, 0xffff
    add                mxyd, 16
%endif
    movd                 m5, mxyd
    mov                mxyd, r6m ; my
    pshufd               m5, m5, q0000
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
%if notcpuflag(ssse3)
    WIN64_SPILL_XMM 8
    pxor                 m6, m6
%endif
    add                  wq, r6
    jmp                  wq
.h_w4:
%if cpuflag(ssse3)
    mova                 m4, [base+bilin_h_shuf4]
%endif
    lea            stride3q, [strideq*3]
.h_w4_loop:
    movq                 m0, [srcq+strideq*0]
    movhps               m0, [srcq+strideq*1]
    movq                 m1, [srcq+strideq*2]
    movhps               m1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    PSHUFB_BILIN_H4      m0, m4, m2
    PMADDUBSW            m0, m5, m6, m2, 0
    PSHUFB_BILIN_H4      m1, m4, m2
    PMADDUBSW            m1, m5, m6, m2, 0
    mova          [tmpq+0 ], m0
    mova          [tmpq+16], m1
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    lea            stride3q, [strideq*3]
.h_w8_loop:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*2]
    movu                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    PSHUFB_BILIN_H8      m0, m4
    PSHUFB_BILIN_H8      m1, m4
    PSHUFB_BILIN_H8      m2, m4
    PSHUFB_BILIN_H8      m3, m4
    PMADDUBSW            m0, m5, m6, m7, 0
    PMADDUBSW            m1, m5, m6, m7, 0
    PMADDUBSW            m2, m5, m6, m7, 0
    PMADDUBSW            m3, m5, m6, m7, 0
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .h_w8_loop
    RET
.h_w16:
    movu                 m0, [srcq+strideq*0+8*0]
    movu                 m1, [srcq+strideq*0+8*1]
    movu                 m2, [srcq+strideq*1+8*0]
    movu                 m3, [srcq+strideq*1+8*1]
    lea                srcq, [srcq+strideq*2]
    PSHUFB_BILIN_H8      m0, m4
    PSHUFB_BILIN_H8      m1, m4
    PSHUFB_BILIN_H8      m2, m4
    PSHUFB_BILIN_H8      m3, m4
    PMADDUBSW            m0, m5, m6, m7, 0
    PMADDUBSW            m1, m5, m6, m7, 0
    PMADDUBSW            m2, m5, m6, m7, 0
    PMADDUBSW            m3, m5, m6, m7, 0
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 2
    jg .h_w16
    RET
.h_w128:
    mov                  r3, -128
    jmp .h_w32_start
.h_w64:
    mov                  r3, -64
    jmp .h_w32_start
.h_w32:
    mov                  r3, -32
.h_w32_start:
    sub                srcq, r3
.h_w32_vloop:
    mov                  r6, r3
.h_w32_hloop:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    movu                 m2, [srcq+r6+8*2]
    movu                 m3, [srcq+r6+8*3]
    PSHUFB_BILIN_H8      m0, m4
    PSHUFB_BILIN_H8      m1, m4
    PSHUFB_BILIN_H8      m2, m4
    PSHUFB_BILIN_H8      m3, m4
    PMADDUBSW            m0, m5, m6, m7, 0
    PMADDUBSW            m1, m5, m6, m7, 0
    PMADDUBSW            m2, m5, m6, m7, 0
    PMADDUBSW            m3, m5, m6, m7, 0
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    add                  r6, 32
    jl .h_w32_hloop
    add                srcq, strideq
    dec                  hd
    jg .h_w32_vloop
    RET
.v:
%if notcpuflag(ssse3)
    WIN64_SPILL_XMM 8
%endif
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
%if cpuflag(ssse3)
    imul               mxyd, 0x00ff00ff
    add                mxyd, 0x00100010
%else
    imul               mxyd, 0xffff
    pxor                 m6, m6
    add                mxyd, 16
%endif
    add                  wq, r6
    lea            stride3q, [strideq*3]
    movd                 m5, mxyd
    pshufd               m5, m5, q0000
    jmp                  wq
.v_w4:
    movd                 m0, [srcq+strideq*0]
.v_w4_loop:
    movd                 m1, [srcq+strideq*1]
    movd                 m2, [srcq+strideq*2]
    movd                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpckldq            m0, m1
    punpckldq            m1, m2
    punpcklbw            m0, m1 ; 01 12
    PMADDUBSW            m0, m5, m6, m7, 0
    mova        [tmpq+16*0], m0
    movd                 m0, [srcq+strideq*0]
    punpckldq            m2, m3
    punpckldq            m3, m0
    punpcklbw            m2, m3 ; 23 34
    PMADDUBSW            m2, m5, m6, m7, 0
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq                 m0, [srcq+strideq*0]
.v_w8_loop:
    movq                 m1, [srcq+strideq*1]
    movq                 m2, [srcq+strideq*2]
    movq                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m0, m1 ; 01
    punpcklbw            m1, m2 ; 12
    PMADDUBSW            m0, m5, m6, m7, 0
    PMADDUBSW            m1, m5, m6, m7, 0
    mova        [tmpq+16*0], m0
    movq                 m0, [srcq+strideq*0]
    punpcklbw            m2, m3 ; 23
    punpcklbw            m3, m0 ; 34
    PMADDUBSW            m2, m5, m6, m7, 0
    mova        [tmpq+16*1], m1
    PMADDUBSW            m3, m5, m6, m7, 0
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    movu                 m0, [srcq+strideq*0]
.v_w16_loop:
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*2]
    movu                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m4, m0, m1
    punpckhbw            m0, m1
    PMADDUBSW            m4, m5, m6, m7, 0
    PMADDUBSW            m0, m5, m6, m7, 0
    mova        [tmpq+16*0], m4
    punpcklbw            m4, m1, m2
    punpckhbw            m1, m2
    PMADDUBSW            m4, m5, m6, m7, 0
    mova        [tmpq+16*1], m0
    movu                 m0, [srcq+strideq*0]
    PMADDUBSW            m1, m5, m6, m7, 0
    mova        [tmpq+16*2], m4
    punpcklbw            m4, m2, m3
    punpckhbw            m2, m3
    PMADDUBSW            m4, m5, m6, m7, 0
    mova        [tmpq+16*3], m1
    PMADDUBSW            m2, m5, m6, m7, 0
    mova        [tmpq+16*4], m4
    punpcklbw            m4, m3, m0
    punpckhbw            m3, m0
    PMADDUBSW            m4, m5, m6, m7, 0
    mova        [tmpq+16*5], m2
    PMADDUBSW            m3, m5, m6, m7, 0
    mova        [tmpq+16*6], m4
    mova        [tmpq+16*7], m3
    add                tmpq, 16*8
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w128:
    lea                 r3d, [hq+(3<<8)]
    mov                 r6d, 256
    jmp .v_w32_start
.v_w64:
    lea                 r3d, [hq+(1<<8)]
    mov                 r6d, 128
    jmp .v_w32_start
.v_w32:
    xor                 r3d, r3d
    mov                 r6d, 64
.v_w32_start:
%if ARCH_X86_64
 %if WIN64
    PUSH                 r7
 %endif
    mov                  r7, tmpq
%endif
    mov                  r5, srcq
.v_w32_hloop:
    movu                 m0, [srcq+strideq*0+16*0]
    movu                 m1, [srcq+strideq*0+16*1]
.v_w32_vloop:
    movu                 m2, [srcq+strideq*1+16*0]
    movu                 m3, [srcq+strideq*1+16*1]
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m4, m0, m2
    punpckhbw            m0, m2
    PMADDUBSW            m4, m5, m6, m7, 0
    PMADDUBSW            m0, m5, m6, m7, 0
    mova        [tmpq+16*0], m4
    mova        [tmpq+16*1], m0
    movu                 m0, [srcq+strideq*0+16*0]
    punpcklbw            m4, m1, m3
    punpckhbw            m1, m3
    PMADDUBSW            m4, m5, m6, m7, 0
    PMADDUBSW            m1, m5, m6, m7, 0
    mova        [tmpq+16*2], m4
    mova        [tmpq+16*3], m1
    movu                 m1, [srcq+strideq*0+16*1]
    add                tmpq, r6
    punpcklbw            m4, m2, m0
    punpckhbw            m2, m0
    PMADDUBSW            m4, m5, m6, m7, 0
    PMADDUBSW            m2, m5, m6, m7, 0
    mova        [tmpq+16*0], m4
    mova        [tmpq+16*1], m2
    punpcklbw            m4, m3, m1
    punpckhbw            m3, m1
    PMADDUBSW            m4, m5, m6, m7, 0
    PMADDUBSW            m3, m5, m6, m7, 0
    mova        [tmpq+16*2], m4
    mova        [tmpq+16*3], m3
    add                tmpq, r6
    sub                  hd, 2
    jg .v_w32_vloop
    add                  r5, 32
    movzx                hd, r3b
    mov                srcq, r5
%if ARCH_X86_64
    add                  r7, 16*4
    mov                tmpq, r7
%else
    mov                tmpq, tmpmp
    add                tmpq, 16*4
    mov               tmpmp, tmpq
%endif
    sub                 r3d, 1<<8
    jg .v_w32_hloop
%if WIN64
    POP                  r7
%endif
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
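    ; i.e. the PMULHRSW invocations below compute
    ; (my * diff + 8) >> 4 (coef = my, rnd = 8, shift = 4) and add it
    ; back onto the 16-scaled horizontal row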
1376    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
1377%if cpuflag(ssse3)
1378    imul               mxyd, 0x08000800
1379    WIN64_SPILL_XMM 8
1380%else
1381    or                 mxyd, 1<<16
1382    WIN64_SPILL_XMM 9
1383 %if ARCH_X86_64
1384    mova                 m8, [base+pw_8]
1385 %else
1386  %define                m8  [base+pw_8]
1387 %endif
1388    pxor                 m7, m7
1389%endif
1390    movd                 m6, mxyd
1391    add                  wq, r6
1392    pshufd               m6, m6, q0000
1393    jmp                  wq
1394.hv_w4:
1395%if cpuflag(ssse3)
1396    mova                 m4, [base+bilin_h_shuf4]
1397    movddup              m0, [srcq+strideq*0]
1398%else
1399    movhps               m0, [srcq+strideq*0]
1400%endif
1401    lea                  r3, [strideq*3]
1402    PSHUFB_BILIN_H4      m0, m4, m3
1403    PMADDUBSW            m0, m5, m7, m4, 0 ; _ 0
1404.hv_w4_loop:
1405    movq                 m1, [srcq+strideq*1]
1406    movhps               m1, [srcq+strideq*2]
1407    movq                 m2, [srcq+r3       ]
1408    lea                srcq, [srcq+strideq*4]
1409    movhps               m2, [srcq+strideq*0]
1410    PSHUFB_BILIN_H4      m1, m4, m3
1411    PSHUFB_BILIN_H4      m2, m4, m3
1412    PMADDUBSW            m1, m5, m7, m4, 0 ; 1 2
1413    PMADDUBSW            m2, m5, m7, m4, 0 ; 3 4
1414    shufpd               m0, m1, 0x01      ; 0 1
1415    shufpd               m3, m1, m2, 0x01  ; 2 3
1416    psubw                m1, m0
1417    PMULHRSW             m1, m6, m4, m8, 4
1418    paddw                m1, m0
1419    mova                 m0, m2
1420    psubw                m2, m3
1421    PMULHRSW             m2, m6, m4, m8, 4
1422    paddw                m2, m3
1423    mova        [tmpq+16*0], m1
1424    mova        [tmpq+16*1], m2
1425    add                tmpq, 32
1426    sub                  hd, 4
1427    jg .hv_w4_loop
1428    RET
.hv_w8:
    movu                 m0, [srcq+strideq*0]
    PSHUFB_BILIN_H8      m0, m4
    PMADDUBSW            m0, m5, m7, m4, 0 ; 0
.hv_w8_loop:
    movu                 m1, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    movu                 m2, [srcq+strideq*0]
    PSHUFB_BILIN_H8      m1, m4
    PSHUFB_BILIN_H8      m2, m4
    PMADDUBSW            m1, m5, m7, m4, 0 ; 1
    PMADDUBSW            m2, m5, m7, m4, 0 ; 2
    psubw                m3, m1, m0
    PMULHRSW             m3, m6, m4, m8, 4
    paddw                m3, m0
    mova                 m0, m2
    psubw                m2, m1
    PMULHRSW             m2, m6, m4, m8, 4
    paddw                m2, m1
    mova        [tmpq+16*0], m3
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w128:
    lea                 r3d, [hq+(7<<8)]
    mov                 r5d, 256
    jmp .hv_w16_start
.hv_w64:
    lea                 r3d, [hq+(3<<8)]
    mov                 r5d, 128
    jmp .hv_w16_start
.hv_w32:
    lea                 r3d, [hq+(1<<8)]
    mov                 r5d, 64
    jmp .hv_w16_start
.hv_w16:
    xor                 r3d, r3d
    mov                 r5d, 32
.hv_w16_start:
%if ARCH_X86_64 || cpuflag(ssse3)
    mov                  r6, srcq
%endif
%if ARCH_X86_64
 %if WIN64
    PUSH                 r7
 %endif
    mov                  r7, tmpq
%endif
.hv_w16_hloop:
    movu                 m0, [srcq+strideq*0+8*0]
    movu                 m1, [srcq+strideq*0+8*1]
    PSHUFB_BILIN_H8      m0, m4
    PSHUFB_BILIN_H8      m1, m4
    PMADDUBSW            m0, m5, m7, m4, 0 ; 0a
    PMADDUBSW            m1, m5, m7, m4, 0 ; 0b
.hv_w16_vloop:
    movu                 m2, [srcq+strideq*1+8*0]
    PSHUFB_BILIN_H8      m2, m4
    PMADDUBSW            m2, m5, m7, m4, 0 ; 1a
    psubw                m3, m2, m0
    PMULHRSW             m3, m6, m4, m8, 4
    paddw                m3, m0
    mova        [tmpq+16*0], m3
    movu                 m3, [srcq+strideq*1+8*1]
    lea                srcq, [srcq+strideq*2]
    PSHUFB_BILIN_H8      m3, m4
    PMADDUBSW            m3, m5, m7, m4, 0 ; 1b
    psubw                m0, m3, m1
    PMULHRSW             m0, m6, m4, m8, 4
    paddw                m0, m1
    mova        [tmpq+16*1], m0
    add                tmpq, r5
    movu                 m0, [srcq+strideq*0+8*0]
    PSHUFB_BILIN_H8      m0, m4
    PMADDUBSW            m0, m5, m7, m4, 0 ; 2a
    psubw                m1, m0, m2
    PMULHRSW             m1, m6, m4, m8, 4
    paddw                m1, m2
    mova        [tmpq+16*0], m1
    movu                 m1, [srcq+strideq*0+8*1]
    PSHUFB_BILIN_H8      m1, m4
    PMADDUBSW            m1, m5, m7, m4, 0 ; 2b
    psubw                m2, m1, m3
    PMULHRSW             m2, m6, m4, m8, 4
    paddw                m2, m3
    mova        [tmpq+16*1], m2
    add                tmpq, r5
    sub                  hd, 2
    jg .hv_w16_vloop
    movzx                hd, r3b
%if ARCH_X86_64
    add                  r6, 16
    add                  r7, 2*16
    mov                srcq, r6
    mov                tmpq, r7
%elif cpuflag(ssse3)
    mov                tmpq, tmpm
    add                  r6, 16
    add                tmpq, 2*16
    mov                srcq, r6
    mov                tmpm, tmpq
%else
    mov                srcq, srcm
    mov                tmpq, tmpm
    add                srcq, 16
    add                tmpq, 2*16
    mov                srcm, srcq
    mov                tmpm, tmpq
%endif
    sub                 r3d, 1<<8
    jg .hv_w16_hloop
%if WIN64
    POP                  r7
%endif
    RET
%endmacro

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
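; How these are consumed: t0d/t1d hold (full_set*15 << 16) | short_set*15.
; put/prep then compute mxd = mx*0x010101 + t0d, so the low byte (mxb) ends
; up carrying short_set*15 + mx (the 4-tap fallback row) while mxd >> 16
; carries full_set*15 + mx (the 8-tap row), both ready to index
; subpel_filters; see the movzx mxd, mxb / shr mxd, 16 pairs below. SHARP has
; no dedicated 4-tap variant, so it shares REGULAR's (3*15).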

%macro FN 4 ; prefix, type, type_h, type_v
cglobal %1_%2_8bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d
%else
    mov                 t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
%endif
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
%elif WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

FN put_8tap, sharp,          SHARP,   SHARP
FN put_8tap, sharp_smooth,   SHARP,   SMOOTH
FN put_8tap, smooth_sharp,   SMOOTH,  SHARP
FN put_8tap, smooth,         SMOOTH,  SMOOTH
FN put_8tap, sharp_regular,  SHARP,   REGULAR
FN put_8tap, regular_sharp,  REGULAR, SHARP
FN put_8tap, smooth_regular, SMOOTH,  REGULAR
FN put_8tap, regular_smooth, REGULAR, SMOOTH
FN put_8tap, regular,        REGULAR, REGULAR
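; regular/regular is deliberately listed last: FN omits its trailing jmp for
; the regular type, so it falls straight through into put_8tap_8bpc below.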

%if ARCH_X86_32
 %define base_reg r1
 %define base base_reg-put_ssse3
%else
 %define base_reg r8
 %define base 0
%endif

cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
%else
    imul                ssd, mym, 0x010101
    add                 ssd, t1d ; 8tap_v, my, 4tap_v
    mov                srcq, srcm
%endif
    mov                  wd, wm
    movifnidn            hd, hm
    LEA            base_reg, put_ssse3
    test                mxd, 0xf00
    jnz .h
%if ARCH_X86_32
    test                ssd, 0xf00
%else
    test                myd, 0xf00
%endif
    jnz .v
    tzcnt                wd, wd
    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
    add                  wq, base_reg
; put_bilin mangling jump
    movifnidn           dsq, dsmp
    movifnidn           ssq, ssmp
%if WIN64
    pop                  r8
%endif
    lea                  r6, [ssq*3]
    jmp                  wq
.h:
%if ARCH_X86_32
    test                ssd, 0xf00
%else
    test                myd, 0xf00
%endif
    jnz .hv
    movifnidn           ssq, ssmp
    WIN64_SPILL_XMM      12
    cmp                  wd, 4
    jl .h_w2
    je .h_w4
    tzcnt                wd, wd
%if ARCH_X86_64
    mova                m10, [base+subpel_h_shufA]
    mova                m11, [base+subpel_h_shufB]
    mova                 m9, [base+subpel_h_shufC]
%endif
    shr                 mxd, 16
    sub                srcq, 3
    movzx                wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
    movq                 m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
    mova                 m7, [base+pw_34] ; 2 + (8 << 2)
    pshufd               m5, m6, q0000
    pshufd               m6, m6, q1111
    add                  wq, base_reg
    jmp                  wq
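; pw_34 = 2 + (8 << 2) fuses the two 8bpc rounding stages of the separable
; filter, (x + 2) >> 2 (intermediate) and (x + 8) >> 4 (final), into a single
; (x + 34) >> 6, which is what the paddw/psraw pairs below implement.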
.h_w2:
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq
    mova                 m4, [base+subpel_h_shuf4]
    movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
    pshufd               m3, m3, q0000
    movifnidn           dsq, dsmp
.h_w2_loop:
    movq                 m0, [srcq+ssq*0]
    movhps               m0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m4
    pmaddubsw            m0, m3
    phaddw               m0, m0
    paddw                m0, m5 ; pw34
    psraw                m0, 6
    packuswb             m0, m0
    movd                r6d, m0
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq
    movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
    mova                 m6, [base+subpel_h_shufA]
    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
    pshufd               m3, m3, q0000
    movifnidn           dsq, dsmp
.h_w4_loop:
    movq                 m0, [srcq+ssq*0] ; 1
    movq                 m1, [srcq+ssq*1] ; 2
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m6 ; subpel_h_shufA
    pshufb               m1, m6 ; subpel_h_shufA
    pmaddubsw            m0, m3 ; subpel_filters
    pmaddubsw            m1, m3 ; subpel_filters
    phaddw               m0, m1
    paddw                m0, m5 ; pw34
    psraw                m0, 6
    packuswb             m0, m0
    movd       [dstq+dsq*0], m0
    psrlq                m0, 32
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
 %if ARCH_X86_32
    pshufb              %2, %1, [base+subpel_h_shufB]
    pshufb              %3, %1, [base+subpel_h_shufC]
    pshufb              %1,     [base+subpel_h_shufA]
 %else
    pshufb              %2, %1, m11; subpel_h_shufB
    pshufb              %3, %1, m9 ; subpel_h_shufC
    pshufb              %1, m10    ; subpel_h_shufA
 %endif
    pmaddubsw           %4, %2, m5 ; subpel +0 B0
    pmaddubsw           %2, m6     ; subpel +4 B4
    pmaddubsw           %3, m6     ; C4
    pmaddubsw           %1, m5     ; A0
    paddw               %3, %4     ; C4+B0
    paddw               %1, %2     ; A0+B4
    phaddw              %1, %3
    paddw               %1, m7     ; pw34
    psraw               %1, 6
%endmacro
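; PUT_8TAP_H splits the 8-tap filter across two pmaddubsw halves: m5 holds
; taps 0-3 and m6 taps 4-7, while subpel_h_shufA/B/C build the byte windows
; at offsets 0, 4 and 8. The paddw pairs complete each 8-tap dot product and
; phaddw merges the partial word pairs into the 8 output words.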
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H           m0, m2, m3, m4
    PUT_8TAP_H           m1, m2, m3, m4
    packuswb             m0, m1
%if ARCH_X86_32
    movq             [dstq], m0
    add                dstq, dsm
    movhps           [dstq], m0
    add                dstq, dsm
%else
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
%endif
    sub                  hd, 2
    jg .h_w8
    RET
.h_w128:
    mov                  r4, -16*7
    jmp .h_w16_start
.h_w64:
    mov                  r4, -16*3
    jmp .h_w16_start
.h_w32:
    mov                  r4, -16*1
    jmp .h_w16_start
.h_w16:
    xor                 r4d, r4d
.h_w16_start:
    sub                srcq, r4
    sub                dstq, r4
.h_w16_loop_v:
    mov                  r6, r4
.h_w16_loop_h:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    PUT_8TAP_H           m0, m2, m3, m4
    PUT_8TAP_H           m1, m2, m3, m4
    packuswb             m0, m1
    mova          [dstq+r6], m0
    add                  r6, 16
    jle .h_w16_loop_h
    add                srcq, ssq
    add                dstq, dsmp
    dec                  hd
    jg .h_w16_loop_v
    RET
.v:
%if ARCH_X86_32
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
    WIN64_SPILL_XMM      16
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
    tzcnt               r6d, wd
    movzx               r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
    punpcklwd            m0, m0
    mova                 m7, [base+pw_512]
    add                  r6, base_reg
%if ARCH_X86_32
 %define            subpel0  [rsp+mmsize*0]
 %define            subpel1  [rsp+mmsize*1]
 %define            subpel2  [rsp+mmsize*2]
 %define            subpel3  [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
    ALLOC_STACK       -16*4
%assign regs_used 7
    pshufd               m1, m0, q0000
    mova            subpel0, m1
    pshufd               m1, m0, q1111
    mova            subpel1, m1
    pshufd               m1, m0, q2222
    mova            subpel2, m1
    pshufd               m1, m0, q3333
    mova            subpel3, m1
    mov                 ssq, [rstk+stack_offset+gprsize*4]
    lea                 ssq, [ssq*3]
    sub                srcq, ssq
    mov                 ssq, [rstk+stack_offset+gprsize*4]
    mov                 dsq, [rstk+stack_offset+gprsize*2]
%else
 %define            subpel0  m8
 %define            subpel1  m9
 %define            subpel2  m10
 %define            subpel3  m11
    lea                ss3q, [ssq*3]
    pshufd               m8, m0, q0000
    sub                srcq, ss3q
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
%endif
    jmp                  r6
.v_w2:
    movd                 m1, [srcq+ssq*0]
    movd                 m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    movd                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m3, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m2, [srcq+ssq*2]
    add                srcq, ss3q
    movd                 m5, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    movd                 m4, [srcq+ssq*2]
    add                srcq, ss3q
%endif
    punpcklwd            m1, m0           ; 0 1
    punpcklwd            m0, m2           ; 1 2
    punpcklbw            m1, m0           ; 01 12
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m2, m5           ; 2 3
    punpcklwd            m5, m3           ; 3 4
    punpcklwd            m3, m4           ; 4 5
    punpcklwd            m4, m0           ; 5 6
    punpcklbw            m2, m5           ; 23 34
    punpcklbw            m3, m4           ; 45 56
.v_w2_loop:
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, subpel0     ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, subpel1         ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, subpel2         ; a2 b2
    paddw                m5, m3
    punpcklwd            m3, m0, m4          ; 6 7
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m4, m0              ; 7 8
    punpcklbw            m3, m4              ; 67 78
    pmaddubsw            m4, m3, subpel3     ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7
    packuswb             m5, m5
    movd                r6d, m5
    mov        [dstq+dsq*0], r6w
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
.v_w8:
.v_w16:
.v_w32:
.v_w64:
.v_w128:
    shl                  wd, 14
%if STACK_ALIGNMENT < 16
 %define               dstm [rsp+mmsize*4+gprsize]
    mov                dstm, dstq
%endif
    lea                 r6d, [hq+wq-(1<<16)]
    mov                  r4, srcq
.v_w4_loop0:
%endif
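; (x86-32) all widths >= 4 share .v_w4: w is packed into the upper bits of
; r6d ((w/4 - 1) << 16, via the shl wd, 14 above) with h in the low 16 bits,
; so a single register carries both counters across the 4-pixel-wide column
; passes.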
    movd                 m1, [srcq+ssq*0]
    movd                 m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    movd                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m3, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m2, [srcq+ssq*2]
    add                srcq, ss3q
    movd                 m5, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    movd                 m4, [srcq+ssq*2]
    add                srcq, ss3q
%endif
    punpckldq            m1, m0           ; 0 1
    punpckldq            m0, m2           ; 1 2
    punpcklbw            m1, m0           ; 01 12
    movd                 m0, [srcq+ssq*0]
    punpckldq            m2, m5           ; 2 3
    punpckldq            m5, m3           ; 3 4
    punpckldq            m3, m4           ; 4 5
    punpckldq            m4, m0           ; 5 6
    punpcklbw            m2, m5           ; 23 34
    punpcklbw            m3, m4           ; 45 56
.v_w4_loop:
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, subpel0  ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, subpel1      ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, subpel2      ; a2 b2
    paddw                m5, m3
    punpckldq            m3, m0, m4       ; 6 7 _ _
    movd                 m0, [srcq+ssq*0]
    punpckldq            m4, m0           ; 7 8 _ _
    punpcklbw            m3, m4           ; 67 78
    pmaddubsw            m4, m3, subpel3  ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7
    packuswb             m5, m5
    movd       [dstq+dsq*0], m5
    psrlq                m5, 32
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov                dstq, dstm
    add                  r4, 4
    movzx                hd, r6w
    add                dstq, 4
    mov                srcq, r4
    mov                dstm, dstq
    sub                 r6d, 1<<16
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
.v_w8:
.v_w16:
.v_w32:
.v_w64:
.v_w128:
    lea                 r6d, [wq*8-64]
    mov                  r4, srcq
    mov                  r7, dstq
    lea                 r6d, [hq+r6*4]
.v_w8_loop0:
    movq                 m1, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*1]
    movq                 m3, [srcq+ssq*2]
    add                srcq, ss3q
    movq                 m4, [srcq+ssq*0]
    movq                 m5, [srcq+ssq*1]
    movq                 m6, [srcq+ssq*2]
    add                srcq, ss3q
    movq                 m0, [srcq+ssq*0]
    punpcklbw            m1, m2 ; 01
    punpcklbw            m2, m3 ; 12
    punpcklbw            m3, m4 ; 23
    punpcklbw            m4, m5 ; 34
    punpcklbw            m5, m6 ; 45
    punpcklbw            m6, m0 ; 56
.v_w8_loop:
    movq                m13, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw           m14, m1, subpel0 ; a0
    mova                 m1, m3
    pmaddubsw           m15, m2, subpel0 ; b0
    mova                 m2, m4
    pmaddubsw            m3, subpel1 ; a1
    mova                m12, m0
    pmaddubsw            m4, subpel1 ; b1
    movq                 m0, [srcq+ssq*0]
    paddw               m14, m3
    paddw               m15, m4
    mova                 m3, m5
    pmaddubsw            m5, subpel2 ; a2
    mova                 m4, m6
    pmaddubsw            m6, subpel2 ; b2
    punpcklbw           m12, m13     ; 67
    punpcklbw           m13, m0      ; 78
    paddw               m14, m5
    mova                 m5, m12
    pmaddubsw           m12, subpel3 ; a3
    paddw               m15, m6
    mova                 m6, m13
    pmaddubsw           m13, subpel3 ; b3
    paddw               m14, m12
    paddw               m15, m13
    pmulhrsw            m14, m7
    pmulhrsw            m15, m7
    packuswb            m14, m15
    movq       [dstq+dsq*0], m14
    movhps     [dstq+dsq*1], m14
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    add                  r4, 8
    add                  r7, 8
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
.hv:
    RESET_STACK_STATE
    cmp                  wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq
    movd                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov                 ssq, ssmp
    lea                  r6, [ssq*3]
    sub                srcq, r6
 %define           base_reg  r6
    mov                  r6, r1; use as new base
 %assign regs_used 2
    ALLOC_STACK  -mmsize*14
 %assign regs_used 7
    mov                 dsq, [rstk+stack_offset+gprsize*2]
 %define           subpelv0  [rsp+mmsize*0]
 %define           subpelv1  [rsp+mmsize*1]
 %define           subpelv2  [rsp+mmsize*2]
 %define           subpelv3  [rsp+mmsize*3]
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    pshufd               m6, m0, q0000
    mova           subpelv0, m6
    pshufd               m6, m0, q1111
    mova           subpelv1, m6
    pshufd               m6, m0, q2222
    mova           subpelv2, m6
    pshufd               m6, m0, q3333
    mova           subpelv3, m6
%else
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK   mmsize*14, 14
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
 %define           subpelv0  m10
 %define           subpelv1  m11
 %define           subpelv2  m12
 %define           subpelv3  m13
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    mova                 m8, [base+pw_8192]
    mova                 m9, [base+pd_512]
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
%endif
    pshufd               m7, m1, q0000
    cmp                  wd, 4
    je .hv_w4
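; hv paths: the 4-tap horizontal stage (w <= 4) scales with
; pmulhrsw(x, pw_8192), i.e. (x + 2) >> 2, keeping 16-bit intermediates; the
; vertical stage then applies the word-precision taps (sign-extended above)
; with pmaddwd, a pd_512 bias and a 10-bit shift, i.e. (y + 512) >> 10 for
; the final rounding.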
.hv_w2:
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m2, [srcq+ssq*0]     ; 0
    movhps               m2, [srcq+ssq*1]     ; 0 _ 1
%if ARCH_X86_32
 %define           w8192reg  [base+pw_8192]
 %define            d512reg  [base+pd_512]
    lea                srcq, [srcq+ssq*2]
    movq                 m0, [srcq+ssq*0]     ; 2
    movhps               m0, [srcq+ssq*1]     ; 2 _ 3
    lea                srcq, [srcq+ssq*2]
%else
 %define           w8192reg  m8
 %define            d512reg  m9
    movq                 m0, [srcq+ssq*2]     ; 2
    add                srcq, ss3q
    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
%endif
    pshufb               m2, m6 ; 0 ~ 1 ~
    pshufb               m0, m6 ; 2 ~ 3 ~
    pmaddubsw            m2, m7 ; subpel_filters
    pmaddubsw            m0, m7 ; subpel_filters
    phaddw               m2, m0 ; 0 1 2 3
    pmulhrsw             m2, w8192reg
%if ARCH_X86_32
    movq                 m3, [srcq+ssq*0]     ; 4
    movhps               m3, [srcq+ssq*1]     ; 4 _ 5
    lea                srcq, [srcq+ssq*2]
%else
    movq                 m3, [srcq+ssq*1]     ; 4
    movhps               m3, [srcq+ssq*2]     ; 4 _ 5
    add                srcq, ss3q
%endif
    movq                 m0, [srcq+ssq*0]     ; 6
    pshufb               m3, m6 ; 4 ~ 5 ~
    pshufb               m0, m6 ; 6 ~
    pmaddubsw            m3, m7 ; subpel_filters
    pmaddubsw            m0, m7 ; subpel_filters
    phaddw               m3, m0 ; 4 5 6 _
    pmulhrsw             m3, w8192reg
    palignr              m4, m3, m2, 4; V        1 2 3 4
    punpcklwd            m1, m2, m4   ; V 01 12    0 1 1 2
    punpckhwd            m2, m4       ; V 23 34    2 3 3 4
    pshufd               m0, m3, q2121; V          5 6 5 6
    punpcklwd            m3, m0       ; V 45 56    4 5 5 6
.hv_w2_loop:
    movq                 m4, [srcq+ssq*1] ; V 7
    lea                srcq, [srcq+ssq*2] ; V
    movhps               m4, [srcq+ssq*0] ; V 7 8
    pshufb               m4, m6
    pmaddubsw            m4, m7
    pmaddwd              m5, m1, subpelv0; V a0 b0
    mova                 m1, m2       ; V
    pmaddwd              m2, subpelv1 ; V a1 b1
    paddd                m5, m2       ; V
    mova                 m2, m3       ; V
    pmaddwd              m3, subpelv2 ; a2 b2
    phaddw               m4, m4
    pmulhrsw             m4, w8192reg
    paddd                m5, m3       ; V
    palignr              m3, m4, m0, 12
    mova                 m0, m4
    punpcklwd            m3, m0           ; V 67 78
    pmaddwd              m4, m3, subpelv3 ; V a3 b3
    paddd                m5, d512reg
    paddd                m5, m4
    psrad                m5, 10
    packssdw             m5, m5
    packuswb             m5, m5
    movd                r4d, m5
    mov        [dstq+dsq*0], r4w
    shr                 r4d, 16
    mov        [dstq+dsq*1], r4w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
%undef w8192reg
%undef d512reg
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
%macro SAVELINE_W4 3
    mova     [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
%macro RESTORELINE_W4 3
    mova     %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
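; The w4 hv path filters the four output columns as two interleaved 2-pixel
; halves (the "lower" and "upper" subpel_h_shuf4 variants); there aren't
; enough registers to keep both pipelines live at once, so each half's line
; history is parked in the fixed stack slots above between iterations.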
%if ARCH_X86_32
 %define           w8192reg  [base+pw_8192]
 %define            d512reg  [base+pd_512]
%else
 %define           w8192reg  m8
 %define            d512reg  m9
%endif
    ; lower shuffle 0 1 2 3 4
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m5, [srcq+ssq*0]   ; 0 _ _ _
    movhps               m5, [srcq+ssq*1]   ; 0 _ 1 _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movq                 m4, [srcq+ssq*0]   ; 2 _ _ _
    movhps               m4, [srcq+ssq*1]   ; 2 _ 3 _
    lea                srcq, [srcq+ssq*2]
%else
    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
    movhps               m4, [srcq+ss3q ]   ; 2 _ 3 _
    lea                srcq, [srcq+ssq*4]
%endif
    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw            m2, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m2, m0 ;H 0 1 2 3
    pmulhrsw             m2, w8192reg ;H pw_8192
    SAVELINE_W4          m2, 2, 0
    ; upper shuffle 2 3 4 5 6
    mova                 m6, [base+subpel_h_shuf4+16]
    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw            m2, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m2, m0 ;H 0 1 2 3
    pmulhrsw             m2, w8192reg ;H pw_8192
    ;
    ; lower shuffle
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m5, [srcq+ssq*0]   ; 4 _ _ _
    movhps               m5, [srcq+ssq*1]   ; 4 _ 5 _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movq                 m4, [srcq+ssq*0]   ; 6 _ _ _
    add                srcq, ssq
%else
    movq                 m4, [srcq+ssq*2]   ; 6 _ _ _
    add                srcq, ss3q
%endif
    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw            m3, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m3, m0 ;H 4 5 6 7
    pmulhrsw             m3, w8192reg ;H pw_8192
    SAVELINE_W4          m3, 3, 0
    ; upper shuffle
    mova                 m6, [base+subpel_h_shuf4+16]
    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw            m3, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m3, m0 ;H 4 5 6 7
    pmulhrsw             m3, w8192reg ;H pw_8192
    ;process high
    palignr              m4, m3, m2, 4;V 1 2 3 4
    punpcklwd            m1, m2, m4  ; V 01 12
    punpckhwd            m2, m4      ; V 23 34
    pshufd               m0, m3, q2121;V 5 6 5 6
    punpcklwd            m3, m0      ; V 45 56
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    ;process low
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    palignr              m4, m3, m2, 4;V 1 2 3 4
    punpcklwd            m1, m2, m4  ; V 01 12
    punpckhwd            m2, m4      ; V 23 34
    pshufd               m0, m3, q2121;V 5 6 5 6
    punpcklwd            m3, m0      ; V 45 56
.hv_w4_loop:
    ;process low
    pmaddwd              m5, m1, subpelv0 ; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2; V a2 b2
    paddd                m5, m3
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m4, [srcq+ssq*0] ; 7
    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw            m4, m7 ;H subpel_filters
    phaddw               m4, m4 ;H                7 8 7 8
    pmulhrsw             m4, w8192reg ;H pw_8192
    palignr              m3, m4, m0, 12         ; 6 7 8 7
    mova                 m0, m4
    punpcklwd            m3, m4      ; 67 78
    pmaddwd              m4, m3, subpelv3; a3 b3
    paddd                m5, d512reg ; pd_512
    paddd                m5, m4
    psrad                m5, 10
    SAVELINE_W4          m0, 0, 0
    SAVELINE_W4          m1, 1, 0
    SAVELINE_W4          m2, 2, 0
    SAVELINE_W4          m3, 3, 0
    SAVELINE_W4          m5, 5, 0
    ;process high
    RESTORELINE_W4       m0, 0, 1
    RESTORELINE_W4       m1, 1, 1
    RESTORELINE_W4       m2, 2, 1
    RESTORELINE_W4       m3, 3, 1
    pmaddwd              m5, m1, subpelv0; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2; V a2 b2
    paddd                m5, m3
    mova                 m6, [base+subpel_h_shuf4+16]
    movq                 m4, [srcq+ssq*0] ; 7
    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
    lea                srcq, [srcq+ssq*2]
    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw            m4, m7 ;H subpel_filters
    phaddw               m4, m4 ;H                7 8 7 8
    pmulhrsw             m4, w8192reg ;H pw_8192
    palignr              m3, m4, m0, 12         ; 6 7 8 7
    mova                 m0, m4
    punpcklwd            m3, m4      ; 67 78
    pmaddwd              m4, m3, subpelv3; a3 b3
    paddd                m5, d512reg ; pd_512
    paddd                m5, m4
    psrad                m4, m5, 10
    RESTORELINE_W4       m5, 5, 0
    packssdw             m5, m4 ; d -> w
    packuswb             m5, m5 ; w -> b
    pshuflw              m5, m5, q3120
    movd       [dstq+dsq*0], m5
    psrlq                m5, 32
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    RESTORELINE_W4       m0, 0, 0
    RESTORELINE_W4       m1, 1, 0
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
%macro SAVELINE_W8 2
    mova     [rsp+hv8_line_%1*mmsize], %2
%endmacro
%macro RESTORELINE_W8 2
    mova     %2, [rsp+hv8_line_%1*mmsize]
%endmacro
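; Same spilling idea as SAVELINE_W4 above: the vertical line history (and, on
; x86-32, the filter coefficients and accumulators as well) lives in fixed
; stack slots and is swapped in and out around each pair of output rows.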
    shr                 mxd, 16
    sub                srcq, 3
%if ARCH_X86_32
 %define           base_reg  r1
 %define           subpelh0  [rsp+mmsize*5]
 %define           subpelh1  [rsp+mmsize*6]
 %define           subpelv0  [rsp+mmsize*7]
 %define           subpelv1  [rsp+mmsize*8]
 %define           subpelv2  [rsp+mmsize*9]
 %define           subpelv3  [rsp+mmsize*10]
 %define             accuv0  [rsp+mmsize*11]
 %define             accuv1  [rsp+mmsize*12]
    movq                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov                 ssq, ssmp
    ALLOC_STACK  -mmsize*13
%if STACK_ALIGNMENT < 16
 %define               dstm  [rsp+mmsize*13+gprsize*1]
 %define                dsm  [rsp+mmsize*13+gprsize*2]
    mov                  r6, [rstk+stack_offset+gprsize*2]
    mov                 dsm, r6
%endif
    pshufd               m0, m1, q0000
    pshufd               m1, m1, q1111
    punpcklbw            m5, m5
    psraw                m5, 8 ; sign-extend
    pshufd               m2, m5, q0000
    pshufd               m3, m5, q1111
    pshufd               m4, m5, q2222
    pshufd               m5, m5, q3333
    mova           subpelh0, m0
    mova           subpelh1, m1
    mova           subpelv0, m2
    mova           subpelv1, m3
    mova           subpelv2, m4
    mova           subpelv3, m5
    lea                  r6, [ssq*3]
    mov                dstm, dstq
    sub                srcq, r6
%else
    ALLOC_STACK        16*5, 16
 %define           subpelh0  m10
 %define           subpelh1  m11
 %define           subpelv0  m12
 %define           subpelv1  m13
 %define           subpelv2  m14
 %define           subpelv3  m15
 %define             accuv0  m8
 %define             accuv1  m9
    movq                 m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd         subpelh0, m0, q0000
    pshufd         subpelh1, m0, q1111
    punpcklbw            m1, m1
    psraw                m1, 8 ; sign-extend
    pshufd         subpelv0, m1, q0000
    pshufd         subpelv1, m1, q1111
    pshufd         subpelv2, m1, q2222
    pshufd         subpelv3, m1, q3333
    lea                ss3q, [ssq*3]
    mov                  r7, dstq
    sub                srcq, ss3q
%endif
    shl                  wd, 14
    lea                 r6d, [hq+wq-(1<<16)]
    mov                  r4, srcq
.hv_w8_loop0:
    movu                 m4, [srcq+ssq*0] ; 0 = _ _
    movu                 m5, [srcq+ssq*1] ; 1 = _ _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
%endif
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
 %if ARCH_X86_32
    pshufb               %3, %1, [base+subpel_h_shufB]
    pshufb               %4, %1, [base+subpel_h_shufC]
    pshufb               %1,     [base+subpel_h_shufA]
 %else
    pshufb               %3, %1, %6  ; subpel_h_shufB
    pshufb               %4, %1, %7  ; subpel_h_shufC
    pshufb               %1, %5      ; subpel_h_shufA
 %endif
    pmaddubsw            %2, %3, subpelh0 ; subpel +0 C0
    pmaddubsw            %4, subpelh1; subpel +4 B4
    pmaddubsw            %3, subpelh1; C4
    pmaddubsw            %1, subpelh0; A0
    paddw                %2, %4      ; C0+B4
    paddw                %1, %3      ; A0+C4
    phaddw               %1, %2
%endmacro
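; HV_H_W8 mirrors PUT_8TAP_H's A/B/C-shuffle decomposition but stops after
; the phaddw merge: the pw_8192 scaling and the vertical pd_512/psrad stage
; that follow take care of rounding.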
%if ARCH_X86_64
    mova                 m7, [base+subpel_h_shufA]
    mova                 m8, [base+subpel_h_shufB]
    mova                 m9, [base+subpel_h_shufC]
%endif
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
%if ARCH_X86_32
    movu                 m6, [srcq+ssq*0] ; 2 = _ _
    movu                 m0, [srcq+ssq*1] ; 3 = _ _
    lea                srcq, [srcq+ssq*2]
%else
    movu                 m6, [srcq+ssq*2] ; 2 = _ _
    add                srcq, ss3q
    movu                 m0, [srcq+ssq*0] ; 3 = _ _
%endif
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
    mova                 m7, [base+pw_8192]
    pmulhrsw             m4, m7 ; H pw_8192
    pmulhrsw             m5, m7 ; H pw_8192
    pmulhrsw             m6, m7 ; H pw_8192
    pmulhrsw             m0, m7 ; H pw_8192
    punpcklwd            m1, m4, m5  ; 0 1 ~
    punpcklwd            m2, m5, m6  ; 1 2 ~
    punpcklwd            m3, m6, m0  ; 2 3 ~
    SAVELINE_W8           1, m1
    SAVELINE_W8           2, m2
    SAVELINE_W8           3, m3
    mova                 m7, [base+subpel_h_shufA]
%if ARCH_X86_32
    movu                 m4, [srcq+ssq*0]       ; 4 = _ _
    movu                 m5, [srcq+ssq*1]       ; 5 = _ _
    lea                srcq, [srcq+ssq*2]
%else
    movu                 m4, [srcq+ssq*1]       ; 4 = _ _
    movu                 m5, [srcq+ssq*2]       ; 5 = _ _
    add                srcq, ss3q
%endif
    movu                 m6, [srcq+ssq*0]       ; 6 = _ _
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
    mova                 m7, [base+pw_8192]
    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
    punpcklwd            m4, m0, m1  ; 3 4 ~
    punpcklwd            m5, m1, m2  ; 4 5 ~
    punpcklwd            m6, m2, m3  ; 5 6 ~
    SAVELINE_W8           6, m3
    RESTORELINE_W8        1, m1
    RESTORELINE_W8        2, m2
    RESTORELINE_W8        3, m3
.hv_w8_loop:
    ; m8 accu for V a
    ; m9 accu for V b
    SAVELINE_W8           1, m3
    SAVELINE_W8           2, m4
    SAVELINE_W8           3, m5
    SAVELINE_W8           4, m6
%if ARCH_X86_32
    pmaddwd              m0, m1, subpelv0 ; a0
    pmaddwd              m7, m2, subpelv0 ; b0
    pmaddwd              m3, subpelv1     ; a1
    pmaddwd              m4, subpelv1     ; b1
    paddd                m0, m3
    paddd                m7, m4
    pmaddwd              m5, subpelv2     ; a2
    pmaddwd              m6, subpelv2     ; b2
    paddd                m0, m5
    paddd                m7, m6
    mova                 m5, [base+pd_512]
    paddd                m0, m5 ;   pd_512
    paddd                m7, m5 ;   pd_512
    mova             accuv0, m0
    mova             accuv1, m7
%else
    pmaddwd              m8, m1, subpelv0 ; a0
    pmaddwd              m9, m2, subpelv0 ; b0
    pmaddwd              m3, subpelv1     ; a1
    pmaddwd              m4, subpelv1     ; b1
    paddd                m8, m3
    paddd                m9, m4
    pmaddwd              m5, subpelv2     ; a2
    pmaddwd              m6, subpelv2     ; b2
    paddd                m8, m5
    paddd                m9, m6
    mova                 m7, [base+pd_512]
    paddd                m8, m7 ;   pd_512
    paddd                m9, m7 ;   pd_512
    mova                 m7, [base+subpel_h_shufB]
    mova                 m6, [base+subpel_h_shufC]
    mova                 m5, [base+subpel_h_shufA]
%endif
    movu                 m0, [srcq+ssq*1] ; 7
    movu                 m4, [srcq+ssq*2] ; 8
    lea                srcq, [srcq+ssq*2]
    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
    mova                 m5, [base+pw_8192]
    pmulhrsw             m0, m5 ; H pw_8192
    pmulhrsw             m4, m5 ; H pw_8192
    RESTORELINE_W8        6, m6
    punpcklwd            m5, m6, m0  ; 6 7  ~
    punpcklwd            m6, m0, m4  ; 7 8 ~
    pmaddwd              m1, m5, subpelv3 ; a3
    paddd                m2, m1, accuv0
    pmaddwd              m1, m6, subpelv3 ; b3
    paddd                m1, m1, accuv1 ; H + V
    psrad                m2, 10
    psrad                m1, 10
    packssdw             m2, m1  ; d -> w
    packuswb             m2, m1 ; w -> b
    movd       [dstq+dsq*0], m2
    psrlq                m2, 32
%if ARCH_X86_32
    add                dstq, dsm
    movd       [dstq+dsq*0], m2
    add                dstq, dsm
%else
    movd       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
%endif
    sub                  hd, 2
    jle .hv_w8_outer
    SAVELINE_W8           6, m4
    RESTORELINE_W8        1, m1
    RESTORELINE_W8        2, m2
    RESTORELINE_W8        3, m3
    RESTORELINE_W8        4, m4
    jmp .hv_w8_loop
.hv_w8_outer:
%if ARCH_X86_32
    mov                dstq, dstm
    add                  r4, 4
    movzx                hd, r6w
    add                dstq, 4
    mov                srcq, r4
    mov                dstm, dstq
%else
    add                  r4, 4
    add                  r7, 4
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
%endif
    sub                 r6d, 1<<16
    jg .hv_w8_loop0
    RET

%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
 %if cpuflag(ssse3)
    pshufb               %1, %2
 %else
  %if %5 == 1
    pcmpeqd              %2, %2
    psrlq                %2, 32
  %endif
    psrldq               %3, %1, 1
    pshufd               %3, %3, q2301
    pand                 %1, %2
    pandn                %4, %2, %3
    por                  %1, %4
 %endif
%endmacro
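; SSE2 fallback for the subpel_h_shuf4-style pshufb: psrldq/pshufd build the
; byte-shifted dwords and the 0x00000000ffffffff qword mask merges them,
; producing dwords { x0..x3, x1..x4, x8..x11, x9..x12 } just like the shuffle
; would (the mask is only rebuilt when reset_mask is set).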

%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
 %ifnidn %1, %2
    mova                 %1, %2
 %endif
    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6
%endmacro

%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
 %if notcpuflag(ssse3)
    psrlq                %1, %2, 16
 %elifnidn %1, %2
    mova                 %1, %2
 %endif
    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6
%endmacro

%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
 %if cpuflag(ssse3)
    palignr              %1, %2, %3, %4
 %else
  %if %0 == 4
   %assign %%i regnumof%+%1 + 1
   %define %%tmp m %+ %%i
  %else
   %define %%tmp %5
  %endif
    psrldq               %1, %3, %4
    pslldq            %%tmp, %2, 16-%4
    por                  %1, %%tmp
 %endif
%endmacro
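; SSE2 palignr emulation: (src2 >> 8*shift) | (src1 << 8*(16-shift)). When no
; temporary is passed, the register numbered one above dst is clobbered, so
; callers must keep that slot free.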

%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
 %if cpuflag(ssse3)
    phaddw               %1, %2
 %elifnidn %1, %2
   %if %4 == 1
    mova                 %3, [base+pw_1]
   %endif
    pmaddwd              %1, %3
    pmaddwd              %2, %3
    packssdw             %1, %2
 %else
   %if %4 == 1
    pmaddwd              %1, [base+pw_1]
   %else
    pmaddwd              %1, %3
   %endif
    packssdw             %1, %1
 %endif
%endmacro
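; SSE2 phaddw emulation: pmaddwd against pw_1 sums adjacent word pairs into
; dwords and packssdw repacks them. This saturates where a real phaddw would
; wrap, which is assumed harmless for the value ranges produced here.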

%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
 %if cpuflag(ssse3)
    pmulhrsw             %1, %2, %3
 %else
    paddw                %1, %2, %3
    psraw                %1, %4
 %endif
%endmacro

%macro PMULHRSW_8192 3 ; dst, src1, src2
    PMULHRSW_POW2        %1, %2, %3, 2
%endmacro
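; pmulhrsw(x, 8192) == (x*8192 + 0x4000) >> 15 == (x + 2) >> 2, so the SSE2
; form is a paddw against pw_2 (passed as src2) followed by psraw 2.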

%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-3]
   movd                  %1, [%2+0]
   movd                  %3, [%2+1]
   movd                  %4, [%2+2]
   movd                  %5, [%2+3]
   punpckldq             %1, %3
   punpckldq             %4, %5
   punpcklqdq            %1, %4
%endmacro

%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
 %if cpuflag(ssse3)
    movu                m%1, [%2]
    pshufb               m2, m%1, m11 ; subpel_h_shufB
    pshufb               m3, m%1, m9  ; subpel_h_shufC
    pshufb              m%1, m10      ; subpel_h_shufA
 %else
  %if ARCH_X86_64
    SWAP                m12, m5
    SWAP                m13, m6
    SWAP                m14, m7
   %define %%mx0 m%+%%i
   %define %%mx1 m%+%%j
   %assign %%i 0
   %rep 12
    movd              %%mx0, [%2+%%i]
    %assign %%i %%i+1
   %endrep
   %assign %%i 0
   %rep 6
    %assign %%j %%i+1
    punpckldq         %%mx0, %%mx1
    %assign %%i %%i+2
   %endrep
   %assign %%i 0
   %rep 3
    %assign %%j %%i+2
    punpcklqdq        %%mx0, %%mx1
    %assign %%i %%i+4
   %endrep
    SWAP                m%1, m0
    SWAP                 m2, m4
    SWAP                 m3, m8
    SWAP                 m5, m12
    SWAP                 m6, m13
    SWAP                 m7, m14
  %else
    PREP_8TAP_H_LOAD4    m0, %2+0, m1, m4, m7
    PREP_8TAP_H_LOAD4    m2, %2+4, m1, m4, m7
    PREP_8TAP_H_LOAD4    m3, %2+8, m1, m4, m7
    SWAP                m%1, m0
  %endif
 %endif
%endmacro

%macro PREP_8TAP_H 2 ; dst, src_memloc
    PREP_8TAP_H_LOAD     %1, %2
 %if ARCH_X86_64 && notcpuflag(ssse3)
    SWAP                 m8, m1
    SWAP                 m9, m7
 %endif
 %xdefine mX m%+%1
 %assign %%i regnumof%+mX
 %define mX m%+%%i
    mova                 m4, m2
    PMADDUBSW            m4, m5, m1, m7, 1  ; subpel +0 B0
    PMADDUBSW            m2, m6, m1, m7, 0  ; subpel +4 B4
    PMADDUBSW            m3, m6, m1, m7, 0  ; subpel +4 C4
    PMADDUBSW            mX, m5, m1, m7, 0  ; subpel +0 A0
 %undef mX
 %if ARCH_X86_64 && notcpuflag(ssse3)
    SWAP                 m1, m8
    SWAP                 m7, m9
 %endif
    paddw                m3, m4
    paddw               m%1, m2
    PHADDW              m%1, m3, m15, ARCH_X86_32
 %if ARCH_X86_64 || cpuflag(ssse3)
    PMULHRSW_8192       m%1, m%1, m7
 %else
    PMULHRSW_8192       m%1, m%1, [base+pw_2]
 %endif
%endmacro

%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
 %if cpuflag(ssse3)
    movu                 %1, [%2]
    pshufb               m2, %1, shufB
    pshufb               m3, %1, shufC
    pshufb               %1, shufA
 %else
    PREP_8TAP_H_LOAD4    %1, %2+0, m1, %3, %4
    PREP_8TAP_H_LOAD4    m2, %2+4, m1, %3, %4
    PREP_8TAP_H_LOAD4    m3, %2+8, m1, %3, %4
 %endif
    mova                 m1, m2
    PMADDUBSW            m1, subpelh0, %3, %4, 1 ; subpel +0 C0
    PMADDUBSW            m3, subpelh1, %3, %4, 0 ; subpel +4 B4
    PMADDUBSW            m2, subpelh1, %3, %4, 0 ; C4
    PMADDUBSW            %1, subpelh0, %3, %4, 0 ; A0
    paddw                m1, m3           ; C0+B4
    paddw                %1, m2           ; A0+C4
    PHADDW               %1, m1, %3, 1
%endmacro
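; Unlike PREP_8TAP_H above, PREP_8TAP_HV stops after the PHADDW merge; the
; (x + 2) >> 2 intermediate scaling appears to be left to its callers (not
; shown here).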
2813
2814%macro PREP_8TAP 0
2815%if ARCH_X86_32
2816 DECLARE_REG_TMP 1, 2
2817%elif WIN64
2818 DECLARE_REG_TMP 6, 4
2819%else
2820 DECLARE_REG_TMP 6, 7
2821%endif
2822
2823FN prep_8tap, sharp,          SHARP,   SHARP
2824FN prep_8tap, sharp_smooth,   SHARP,   SMOOTH
2825FN prep_8tap, smooth_sharp,   SMOOTH,  SHARP
2826FN prep_8tap, smooth,         SMOOTH,  SMOOTH
2827FN prep_8tap, sharp_regular,  SHARP,   REGULAR
2828FN prep_8tap, regular_sharp,  REGULAR, SHARP
2829FN prep_8tap, smooth_regular, SMOOTH,  REGULAR
2830FN prep_8tap, regular_smooth, REGULAR, SMOOTH
2831FN prep_8tap, regular,        REGULAR, REGULAR
2832
2833%if ARCH_X86_32
2834 %define base_reg r2
2835 %define base base_reg-prep%+SUFFIX
2836%else
2837 %define base_reg r7
2838 %define base 0
2839%endif
2840cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
2841    imul                mxd, mxm, 0x010101
2842    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2843    imul                myd, mym, 0x010101
2844    add                 myd, t1d ; 8tap_v, my, 4tap_v
2845    mov                  wd, wm
2846    movifnidn          srcd, srcm
2847    movifnidn            hd, hm
2848    test                mxd, 0xf00
2849    jnz .h
2850    test                myd, 0xf00
2851    jnz .v
2852    LEA            base_reg, prep_ssse3
2853    tzcnt                wd, wd
2854    movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
2855    pxor                 m4, m4
2856    add                  wq, base_reg
2857    movifnidn       strided, stridem
2858    lea                  r6, [strideq*3]
2859%if WIN64
2860    pop                  r8
2861    pop                  r7
2862%endif
2863    jmp                  wq
2864.h:
2865    LEA            base_reg, prep%+SUFFIX
2866    test                myd, 0xf00
2867    jnz .hv
2868%if cpuflag(ssse3)
2869    WIN64_SPILL_XMM      12
2870%else
2871    WIN64_SPILL_XMM      16
2872%endif
2873%if ARCH_X86_32
2874 %define strideq r6
2875    mov             strideq, stridem
2876%endif
2877    cmp                  wd, 4
2878    je .h_w4
2879    tzcnt                wd, wd
2880%if cpuflag(ssse3)
2881 %if ARCH_X86_64
2882    mova                m10, [base+subpel_h_shufA]
2883    mova                m11, [base+subpel_h_shufB]
2884    mova                 m9, [base+subpel_h_shufC]
2885 %else
2886  %define m10 [base+subpel_h_shufA]
2887  %define m11 [base+subpel_h_shufB]
2888  %define m9  [base+subpel_h_shufC]
2889 %endif
2890%endif
2891    shr                 mxd, 16
2892    sub                srcq, 3
2893    movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
2894    movq                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
2895%if cpuflag(ssse3)
2896    mova                 m7, [base+pw_8192]
2897    pshufd               m5, m6, q0000
2898    pshufd               m6, m6, q1111
2899%else
2900    punpcklbw            m6, m6
2901    psraw                m6, 8
2902 %if ARCH_X86_64
2903    mova                 m7, [pw_2]
2904    mova                m15, [pw_1]
2905 %else
2906  %define m15 m4
2907 %endif
2908    pshufd               m5, m6, q1010
2909    punpckhqdq           m6, m6
2910%endif
2911    add                  wq, base_reg
2912    jmp                  wq
2913.h_w4:
2914%if ARCH_X86_32
2915    and                 mxd, 0x7f
2916%else
2917    movzx               mxd, mxb
2918%endif
2919    dec                srcq
2920    movd                 m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
2921%if cpuflag(ssse3)
2922    mova                 m6, [base+pw_8192]
2923    mova                 m5, [base+subpel_h_shufA]
2924    pshufd               m4, m4, q0000
2925%else
2926    mova                 m6, [base+pw_2]
2927 %if ARCH_X86_64
2928    mova                m14, [pw_1]
2929 %else
2930  %define m14 m7
2931 %endif
2932    punpcklbw            m4, m4
2933    psraw                m4, 8
2934    punpcklqdq           m4, m4
2935%endif
2936%if ARCH_X86_64
2937    lea            stride3q, [strideq*3]
2938%endif
2939.h_w4_loop:
2940%if cpuflag(ssse3)
2941    movq                 m0, [srcq+strideq*0] ; 0
2942    movq                 m1, [srcq+strideq*1] ; 1
2943 %if ARCH_X86_32
2944    lea                srcq, [srcq+strideq*2]
2945    movq                 m2, [srcq+strideq*0] ; 2
2946    movq                 m3, [srcq+strideq*1] ; 3
2947    lea                srcq, [srcq+strideq*2]
2948 %else
2949    movq                 m2, [srcq+strideq*2] ; 2
2950    movq                 m3, [srcq+stride3q ] ; 3
2951    lea                srcq, [srcq+strideq*4]
2952 %endif
2953    pshufb               m0, m5
2954    pshufb               m1, m5
2955    pshufb               m2, m5
2956    pshufb               m3, m5
2957%elif ARCH_X86_64
2958    movd                 m0, [srcq+strideq*0+0]
2959    movd                m12, [srcq+strideq*0+1]
2960    movd                 m1, [srcq+strideq*1+0]
2961    movd                 m5, [srcq+strideq*1+1]
2962    movd                 m2, [srcq+strideq*2+0]
2963    movd                m13, [srcq+strideq*2+1]
2964    movd                 m3, [srcq+stride3q +0]
2965    movd                 m7, [srcq+stride3q +1]
2966    punpckldq            m0, m12
2967    punpckldq            m1, m5
2968    punpckldq            m2, m13
2969    punpckldq            m3, m7
2970    movd                m12, [srcq+strideq*0+2]
2971    movd                 m8, [srcq+strideq*0+3]
2972    movd                 m5, [srcq+strideq*1+2]
2973    movd                 m9, [srcq+strideq*1+3]
2974    movd                m13, [srcq+strideq*2+2]
2975    movd                m10, [srcq+strideq*2+3]
2976    movd                 m7, [srcq+stride3q +2]
2977    movd                m11, [srcq+stride3q +3]
2978    lea                srcq, [srcq+strideq*4]
2979    punpckldq           m12, m8
2980    punpckldq            m5, m9
2981    punpckldq           m13, m10
2982    punpckldq            m7, m11
2983    punpcklqdq           m0, m12 ; 0
2984    punpcklqdq           m1, m5  ; 1
2985    punpcklqdq           m2, m13 ; 2
2986    punpcklqdq           m3, m7  ; 3
2987%else
2988    movd                 m0, [srcq+strideq*0+0]
2989    movd                 m1, [srcq+strideq*0+1]
2990    movd                 m2, [srcq+strideq*0+2]
2991    movd                 m3, [srcq+strideq*0+3]
2992    punpckldq            m0, m1
2993    punpckldq            m2, m3
2994    punpcklqdq           m0, m2 ; 0
2995    movd                 m1, [srcq+strideq*1+0]
2996    movd                 m2, [srcq+strideq*1+1]
2997    movd                 m3, [srcq+strideq*1+2]
2998    movd                 m7, [srcq+strideq*1+3]
2999    lea                srcq, [srcq+strideq*2]
3000    punpckldq            m1, m2
3001    punpckldq            m3, m7
3002    punpcklqdq           m1, m3 ; 1
3003    movd                 m2, [srcq+strideq*0+0]
3004    movd                 m3, [srcq+strideq*0+1]
3005    movd                 m7, [srcq+strideq*0+2]
3006    movd                 m5, [srcq+strideq*0+3]
3007    punpckldq            m2, m3
3008    punpckldq            m7, m5
3009    punpcklqdq           m2, m7 ; 2
3010    movd                 m3, [srcq+strideq*1+0]
3011    movd                 m7, [srcq+strideq*1+1]
3012    punpckldq            m3, m7
3013    movd                 m7, [srcq+strideq*1+2]
3014    movd                 m5, [srcq+strideq*1+3]
3015    lea                srcq, [srcq+strideq*2]
3016    punpckldq            m7, m5
3017    punpcklqdq           m3, m7 ; 3
3018%endif
3019    PMADDUBSW            m0, m4, m5, m7, 1 ; subpel_filters + 2
3020    PMADDUBSW            m1, m4, m5, m7, 0
3021    PMADDUBSW            m2, m4, m5, m7, 0
3022    PMADDUBSW            m3, m4, m5, m7, 0
3023    PHADDW               m0, m1, m14, ARCH_X86_32
3024    PHADDW               m2, m3, m14, 0
3025    PMULHRSW_8192        m0, m0, m6
3026    PMULHRSW_8192        m2, m2, m6
3027    mova        [tmpq+16*0], m0
3028    mova        [tmpq+16*1], m2
3029    add                tmpq, 32
3030    sub                  hd, 4
3031    jg .h_w4_loop
3032    RET
.h_w8:
%if cpuflag(ssse3)
    PREP_8TAP_H           0, srcq+strideq*0
    PREP_8TAP_H           1, srcq+strideq*1
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    lea                srcq, [srcq+strideq*2]
    add                tmpq, 32
    sub                  hd, 2
%else
    PREP_8TAP_H           0, srcq
    mova             [tmpq], m0
    add                srcq, strideq
    add                tmpq, 16
    dec                  hd
%endif
    jg .h_w8
    RET
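; For orientation: prep writes unclipped 16-bit intermediates (w words per
; row at tmpq) for the compound prediction paths. A scalar sketch of the
; horizontal pass, assuming f[] is the selected 8-tap filter and the
; source row is addressable at src[x-3..x+4]:
;     int sum = 0;
;     for (int k = 0; k < 8; k++)
;         sum += f[k] * src[x + k - 3];
;     tmp[x] = (int16_t)((sum + 2) >> 2); // same rounding as PMULHRSW_8192
; The SSSE3 body above emits two rows per iteration, the SSE2 one row.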
.h_w16:
    mov                  r3, -16*1
    jmp .h_start
.h_w32:
    mov                  r3, -16*2
    jmp .h_start
.h_w64:
    mov                  r3, -16*4
    jmp .h_start
.h_w128:
    mov                  r3, -16*8
.h_start:
    sub                srcq, r3
    mov                  r5, r3
.h_loop:
%if cpuflag(ssse3)
    PREP_8TAP_H           0, srcq+r3+8*0
    PREP_8TAP_H           1, srcq+r3+8*1
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 32
    add                  r3, 16
%else
    PREP_8TAP_H           0, srcq+r3
    mova             [tmpq], m0
    add                tmpq, 16
    add                  r3, 8
%endif
    jl .h_loop
    add                srcq, strideq
    mov                  r3, r5
    dec                  hd
    jg .h_loop
    RET
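; The wide-width loop above runs on a negative-offset counter: r3 starts
; at -w bytes (-16*1 for w16 up to -16*8 for w128), srcq is biased to the
; row end by 'sub srcq, r3', and r3 then doubles as load offset and loop
; condition until it hits zero. Roughly, in C:
;     const uint8_t *end = src + w;          // srcq after the bias
;     for (ptrdiff_t x = -w; x < 0; x += 16) // the SSE2 body steps by 8
;         prep16(end + x);                   // one PREP_8TAP_H pair
; This spares a separate column counter; r5 merely reloads r3 per row.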
.v:
    LEA            base_reg, prep%+SUFFIX
%if ARCH_X86_32
    mov                 mxd, myd
    and                 mxd, 0x7f
%else
    WIN64_SPILL_XMM      16
    movzx               mxd, myb
%endif
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
%if cpuflag(ssse3)
    mova                 m2, [base+pw_512]
    mova                 m7, [base+pw_8192]
    punpcklwd            m0, m0
%else
    punpcklbw            m0, m0
    psraw                m0, 8
%endif
%if ARCH_X86_32
 %define            subpel0  [rsp+mmsize*0]
 %define            subpel1  [rsp+mmsize*1]
 %define            subpel2  [rsp+mmsize*2]
 %define            subpel3  [rsp+mmsize*3]
%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
 %if cpuflag(ssse3)
    ALLOC_STACK   -mmsize*4
 %else
    ALLOC_STACK   -mmsize*5
 %endif
%assign regs_used 7
    mov             strideq, [rstk+stack_offset+gprsize*3]
    pshufd               m1, m0, q0000
    mova            subpel0, m1
    pshufd               m1, m0, q1111
    mova            subpel1, m1
    lea                  r5, [strideq*3]
    pshufd               m1, m0, q2222
    mova            subpel2, m1
    pshufd               m1, m0, q3333
    mova            subpel3, m1
    sub                srcq, r5
%else
 %define            subpel0  m8
 %define            subpel1  m9
 %define            subpel2  m10
 %define            subpel3  m11
    pshufd               m8, m0, q0000
    pshufd               m9, m0, q1111
    lea            stride3q, [strideq*3]
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
    sub                srcq, stride3q
    cmp                  wd, 8
    jns .v_w8
%endif
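; Vertical setup, for reference: the eight signed coefficients arrive as
; one qword and are broadcast pairwise into subpel0..subpel3, so each
; pmaddubsw consumes one (f[2k], f[2k+1]) pair. Per output pixel the 8-tap
; MAC decomposes as (scalar sketch):
;     sum = f[0]*p0 + f[1]*p1    // subpel0
;         + f[2]*p2 + f[3]*p3    // subpel1
;         + f[4]*p4 + f[5]*p5    // subpel2
;         + f[6]*p6 + f[7]*p7;   // subpel3
; SSSE3 keeps the pairs as bytes for pmaddubsw; the SSE2 path widens them
; to words up front (punpcklbw + psraw 8). The cmp/cmovs pair above picks
; the alternate filter half packed into myd for block heights below 6.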
.v_w4:
%if notcpuflag(ssse3)
    pxor                 m6, m6
 %if ARCH_X86_64
    mova                 m7, [base+pw_2]
 %endif
%endif
%if ARCH_X86_32
 %if STACK_ALIGNMENT < mmsize
  %define srcm [esp+stack_size+gprsize*1]
  %define tmpm [esp+stack_size+gprsize*2]
 %endif
    mov                tmpm, tmpq
    mov                srcm, srcq
    lea                 r5d, [wq - 4] ; horizontal loop
    shl                 r5d, (16 - 2)  ; ((wq / 4) - 1) << 16
    mov                 r5w, hw
.v_w4_loop0:
%endif
    movd                 m1, [srcq+strideq*0]
    movd                 m0, [srcq+strideq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+strideq*2]
    movd                 m2, [srcq+strideq*0]
    movd                 m4, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    movd                 m3, [srcq+strideq*0]
    movd                 m5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
%else
    movd                 m2, [srcq+strideq*2]
    add                srcq, stride3q
    movd                 m4, [srcq+strideq*0]
    movd                 m3, [srcq+strideq*1]
    movd                 m5, [srcq+strideq*2]
    add                srcq, stride3q
%endif
    punpckldq            m1, m0 ; 0 1
    punpckldq            m0, m2 ; 1 2
    punpcklbw            m1, m0 ; 01 12
    movd                 m0, [srcq+strideq*0]
    punpckldq            m2, m4 ; 2 3
    punpckldq            m4, m3 ; 3 4
    punpckldq            m3, m5 ; 4 5
    punpckldq            m5, m0 ; 5 6
    punpcklbw            m2, m4 ; 23 34
    punpcklbw            m3, m5 ; 45 56
.v_w4_loop:
%if ARCH_X86_32 && notcpuflag(ssse3)
    mova                 m7, subpel0
 %define subpel0 m7
%endif
    mova                 m5, m1
    PMADDUBSW            m5, subpel0, m6, m4, 0  ; a0 b0
%if ARCH_X86_32 && notcpuflag(ssse3)
    mova                 m7, subpel1
 %define subpel1 m7
%endif
    mova                 m1, m2
    PMADDUBSW            m2, subpel1, m6, m4, 0  ; a1 b1
    paddw                m5, m2
%if ARCH_X86_32 && notcpuflag(ssse3)
    mova                 m7, subpel2
 %define subpel2 m7
%endif
    mova                 m2, m3
    PMADDUBSW            m3, subpel2, m6, m4, 0  ; a2 b2
    movd                 m4, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    paddw                m5, m3
    punpckldq            m3, m0, m4       ; 6 7 _ _
    movd                 m0, [srcq+strideq*0]
    punpckldq            m4, m0           ; 7 8 _ _
    punpcklbw            m3, m4           ; 67 78
%if notcpuflag(ssse3)
 %if ARCH_X86_64
    SWAP                m12, m0
 %else
    mova     [esp+mmsize*4], m0
    mova                 m7, subpel3
  %define subpel3 m7
 %endif
%endif
    mova                 m4, m3
    PMADDUBSW            m4, subpel3, m6, m0, 0  ; a3 b3
    paddw                m5, m4
%if ARCH_X86_64 || cpuflag(ssse3)
 %if notcpuflag(ssse3)
    SWAP                 m0, m12
 %endif
    PMULHRSW_8192        m5, m5, m7
%else
    mova                 m0, [esp+mmsize*4]
    PMULHRSW_8192        m5, m5, [base+pw_2]
%endif
    movq        [tmpq+wq*0], m5
    movhps      [tmpq+wq*2], m5
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    mov                srcq, srcm
    mov                tmpq, tmpm
    movzx                hd, r5w
    add                srcq, 4
    add                tmpq, 8
    mov                srcm, srcq
    mov                tmpm, tmpq
    sub                 r5d, 1<<16 ; horizontal--
    jg .v_w4_loop0
%endif
    RET
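; The w4 vertical loop keeps a sliding window of interleaved row pairs:
; m1/m2/m3 hold (01,12), (23,34), (45,56), and each iteration pulls in two
; fresh rows to form (67,78), yielding two output rows at once. The state
; rotation amounts to:
;     pairs01 = pairs23; pairs23 = pairs45; pairs45 = pairs67; // mova chain
; so every source row is loaded once and reused by all four tap pairs.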
%if ARCH_X86_64
.v_w8:
    lea                 r6d, [wq*8-64]
    mov                  r5, srcq
    mov                  r8, tmpq
    lea                 r6d, [hq+r6*4]
.v_w8_loop0:
    movq                 m1, [srcq+strideq*0]
    movq                 m2, [srcq+strideq*1]
    movq                 m3, [srcq+strideq*2]
    add                srcq, stride3q
    movq                 m4, [srcq+strideq*0]
    movq                 m5, [srcq+strideq*1]
    movq                 m6, [srcq+strideq*2]
    add                srcq, stride3q
    movq                 m0, [srcq+strideq*0]
    punpcklbw            m1, m2 ; 01
    punpcklbw            m2, m3 ; 12
    punpcklbw            m3, m4 ; 23
    punpcklbw            m4, m5 ; 34
    punpcklbw            m5, m6 ; 45
    punpcklbw            m6, m0 ; 56
.v_w8_loop:
    movq                m13, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
%if cpuflag(ssse3)
    pmaddubsw           m14, m1, subpel0 ; a0
    pmaddubsw           m15, m2, subpel0 ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddubsw            m3, subpel1 ; a1
    pmaddubsw            m4, subpel1 ; b1
    paddw               m14, m3
    paddw               m15, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddubsw            m5, subpel2 ; a2
    pmaddubsw            m6, subpel2 ; b2
    punpcklbw           m12, m0, m13 ; 67
    movq                 m0, [srcq+strideq*0]
    punpcklbw           m13, m0      ; 78
    paddw               m14, m5
    mova                 m5, m12
    pmaddubsw           m12, subpel3 ; a3
    paddw               m15, m6
    mova                 m6, m13
    pmaddubsw           m13, subpel3 ; b3
    paddw               m14, m12
    paddw               m15, m13
    pmulhrsw            m14, m7
    pmulhrsw            m15, m7
%else
    mova                m14, m1
    PMADDUBSW           m14, subpel0, m7, m12, 1 ; a0
    mova                m15, m2
    PMADDUBSW           m15, subpel0, m7, m12, 0 ; b0
    mova                 m1, m3
    PMADDUBSW            m3, subpel1, m7, m12, 0 ; a1
    mova                 m2, m4
    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
    paddw               m14, m3
    mova                 m3, m5
    PMADDUBSW            m5, subpel2, m7, m12, 0 ; a2
    paddw               m15, m4
    mova                 m4, m6
    PMADDUBSW            m6, subpel2, m7, m12, 0 ; b2
    paddw               m15, m6
    punpcklbw           m12, m0, m13 ; 67
    movq                 m0, [srcq+strideq*0]
    punpcklbw           m13, m0      ; 78
    paddw               m14, m5
    mova                 m5, m12
    PMADDUBSW           m12, subpel3, m7, m6, 0  ; a3
    paddw               m14, m12
    mova                 m6, m13
    PMADDUBSW           m13, subpel3, m7, m12, 0 ; b3
    paddw               m15, m13
    PMULHRSW_8192       m14, m14, [base+pw_2]
    PMULHRSW_8192       m15, m15, [base+pw_2]
%endif
    movu        [tmpq+wq*0], m14
    movu        [tmpq+wq*2], m15
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w8_loop
    add                  r5, 8
    add                  r8, 16
    movzx                hd, r6b
    mov                srcq, r5
    mov                tmpq, r8
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
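; .v_w8 walks wider blocks as 8-pixel columns with a packed counter: r6d
; keeps the remaining column count in its upper bytes and the block height
; in its low byte ('movzx hd, r6b' restores h, 'sub r6d, 1<<8' retires a
; column). Equivalent control flow, as a sketch:
;     for (int col = 0; col < w / 8; col++)  // r6d upper bytes
;         for (int y = h; y > 0; y -= 2)     // r6d low byte reloads h
;             filter_two_rows(col);
; r5/r8 checkpoint the per-column srcq/tmpq bases between passes.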
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
.hv:
    RESET_STACK_STATE
    cmp                  wd, 4
    jg .hv_w8
    and                 mxd, 0x7f
    movd                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
%if ARCH_X86_32
    mov                 mxd, myd
    shr                 myd, 16
    and                 mxd, 0x7f
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
    mov             strideq, stridem
 %assign regs_used 6
    ALLOC_STACK  -mmsize*14
 %assign regs_used 7
    lea                  r5, [strideq*3+1]
    sub                srcq, r5
 %define           subpelv0  [rsp+mmsize*0]
 %define           subpelv1  [rsp+mmsize*1]
 %define           subpelv2  [rsp+mmsize*2]
 %define           subpelv3  [rsp+mmsize*3]
    punpcklbw            m0, m0
    psraw                m0, 8
    pshufd               m6, m0, q0000
    mova           subpelv0, m6
    pshufd               m6, m0, q1111
    mova           subpelv1, m6
    pshufd               m6, m0, q2222
    mova           subpelv2, m6
    pshufd               m6, m0, q3333
    mova           subpelv3, m6
%else
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
 %if cpuflag(ssse3)
    ALLOC_STACK   mmsize*14, 14
 %else
    ALLOC_STACK   mmsize*14, 16
 %endif
    lea            stride3q, [strideq*3]
    sub                srcq, stride3q
    dec                srcq
 %define           subpelv0  m10
 %define           subpelv1  m11
 %define           subpelv2  m12
 %define           subpelv3  m13
    punpcklbw            m0, m0
    psraw                m0, 8
 %if cpuflag(ssse3)
    mova                 m8, [base+pw_8192]
 %else
    mova                 m8, [base+pw_2]
 %endif
    mova                 m9, [base+pd_32]
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
%endif
    pshufd               m7, m1, q0000
%if notcpuflag(ssse3)
    punpcklbw            m7, m7
    psraw                m7, 8
%endif
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
%if ARCH_X86_32
 %if cpuflag(ssse3)
  %define          w8192reg  [base+pw_8192]
 %else
  %define          w8192reg  [base+pw_2]
 %endif
 %define             d32reg  [base+pd_32]
%else
 %define           w8192reg  m8
 %define             d32reg  m9
%endif
    ; lower shuffle 0 1 2 3 4
%if cpuflag(ssse3)
    mova                 m6, [base+subpel_h_shuf4]
%else
 %if ARCH_X86_64
    mova                m15, [pw_1]
 %else
  %define               m15 m1
 %endif
%endif
    movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
    movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
%if ARCH_X86_32
    lea                srcq, [srcq+strideq*2]
    movq                 m4, [srcq+strideq*0]   ; 2 _ _ _
    movhps               m4, [srcq+strideq*1]   ; 2 _ 3 _
    lea                srcq, [srcq+strideq*2]
%else
    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
    movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
    lea                srcq, [srcq+strideq*4]
%endif
    PSHUFB_SUBPEL_H_4a   m2, m5, m6, m1, m3, 1    ; H subpel_h_shuf4 0~1~
    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m3, 0    ; H subpel_h_shuf4 2~3~
    PMADDUBSW            m2, m7, m1, m3, 1        ; H subpel_filters
    PMADDUBSW            m0, m7, m1, m3, 0        ; H subpel_filters
    PHADDW               m2, m0, m15, ARCH_X86_32 ; H 0 1 2 3
    PMULHRSW_8192        m2, m2, w8192reg
    SAVELINE_W4          m2, 2, 0
    ; upper shuffle 2 3 4 5 6
%if cpuflag(ssse3)
    mova                 m6, [base+subpel_h_shuf4+16]
%endif
    PSHUFB_SUBPEL_H_4b   m2, m5, m6, m1, m3, 0    ; H subpel_h_shuf4 0~1~
    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m3, 0    ; H subpel_h_shuf4 2~3~
    PMADDUBSW            m2, m7, m1, m3, 1        ; H subpel_filters
    PMADDUBSW            m0, m7, m1, m3, 0        ; H subpel_filters
    PHADDW               m2, m0, m15, ARCH_X86_32 ; H 0 1 2 3
    PMULHRSW_8192        m2, m2, w8192reg
%if notcpuflag(ssse3)
 %if ARCH_X86_64
    SWAP                m14, m2
 %else
    mova     [esp+mmsize*4], m2
 %endif
%endif
    ; lower shuffle
%if cpuflag(ssse3)
    mova                 m6, [base+subpel_h_shuf4]
%endif
    movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
    movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
%if ARCH_X86_32
    lea                srcq, [srcq+strideq*2]
    movq                 m4, [srcq+strideq*0]   ; 6 _ _ _
    add                srcq, strideq
%else
    movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
    add                srcq, stride3q
%endif
    PSHUFB_SUBPEL_H_4a   m3, m5, m6, m1, m2, 0    ; H subpel_h_shuf4 4~5~
    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m2, 0    ; H subpel_h_shuf4 6~6~
    PMADDUBSW            m3, m7, m1, m2, 1        ; H subpel_filters
    PMADDUBSW            m0, m7, m1, m2, 0        ; H subpel_filters
    PHADDW               m3, m0, m15, ARCH_X86_32 ; H 4 5 6 7
    PMULHRSW_8192        m3, m3, w8192reg
    SAVELINE_W4          m3, 3, 0
    ; upper shuffle
%if cpuflag(ssse3)
    mova                 m6, [base+subpel_h_shuf4+16]
%endif
    PSHUFB_SUBPEL_H_4b   m3, m5, m6, m1, m2, 0    ; H subpel_h_shuf4 4~5~
    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m2, 0    ; H subpel_h_shuf4 6~6~
    PMADDUBSW            m3, m7, m1, m2, 1        ; H subpel_filters
    PMADDUBSW            m0, m7, m1, m2, 0        ; H subpel_filters
    PHADDW               m3, m0, m15, ARCH_X86_32 ; H 4 5 6 7
    PMULHRSW_8192        m3, m3, w8192reg
%if notcpuflag(ssse3)
 %if ARCH_X86_64
    SWAP                 m2, m14
 %else
    mova                 m2, [esp+mmsize*4]
 %endif
%endif
    ; process high
    PALIGNR              m4, m3, m2, 4 ; V 1 2 3 4
    punpcklwd            m1, m2, m4    ; V 01 12
    punpckhwd            m2, m4        ; V 23 34
    pshufd               m0, m3, q2121 ; V 5 6 5 6
    punpcklwd            m3, m0        ; V 45 56
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    ; process low
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    PALIGNR              m4, m3, m2, 4 ; V 1 2 3 4
    punpcklwd            m1, m2, m4    ; V 01 12
    punpckhwd            m2, m4        ; V 23 34
    pshufd               m0, m3, q2121 ; V 5 6 5 6
    punpcklwd            m3, m0        ; V 45 56
.hv_w4_loop:
    ; process low
    pmaddwd              m5, m1, subpelv0 ; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1 ; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2 ; V a2 b2
    paddd                m5, m3
%if notcpuflag(ssse3)
 %if ARCH_X86_64
    SWAP                m14, m5
 %else
    mova     [esp+mmsize*4], m5
  %define m15 m3
 %endif
%endif
%if cpuflag(ssse3)
    mova                 m6, [base+subpel_h_shuf4]
%endif
    movq                 m4, [srcq+strideq*0] ; 7
    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
    PSHUFB_SUBPEL_H_4a   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
    PMULHRSW_8192        m4, m4, w8192reg
    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
    mova                 m0, m4
    punpcklwd            m3, m4           ; 67 78
    pmaddwd              m4, m3, subpelv3 ; a3 b3
%if notcpuflag(ssse3)
 %if ARCH_X86_64
    SWAP                 m5, m14
 %else
    mova                 m5, [esp+mmsize*4]
 %endif
%endif
    paddd                m5, d32reg ; pd_32
    paddd                m5, m4
    psrad                m5, 6
    SAVELINE_W4          m0, 0, 0
    SAVELINE_W4          m1, 1, 0
    SAVELINE_W4          m2, 2, 0
    SAVELINE_W4          m3, 3, 0
    SAVELINE_W4          m5, 5, 0
    ; process high
    RESTORELINE_W4       m0, 0, 1
    RESTORELINE_W4       m1, 1, 1
    RESTORELINE_W4       m2, 2, 1
    RESTORELINE_W4       m3, 3, 1
    pmaddwd              m5, m1, subpelv0 ; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1 ; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2 ; V a2 b2
    paddd                m5, m3
%if notcpuflag(ssse3)
 %if ARCH_X86_64
    SWAP                m14, m5
 %else
    mova         [esp+0xA0], m5
 %endif
%endif
%if cpuflag(ssse3)
    mova                 m6, [base+subpel_h_shuf4+16]
%endif
    movq                 m4, [srcq+strideq*0] ; 7
    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
    PSHUFB_SUBPEL_H_4b   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
    PMULHRSW_8192        m4, m4, w8192reg
    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
    mova                 m0, m4
    punpcklwd            m3, m4           ; 67 78
    pmaddwd              m4, m3, subpelv3 ; a3 b3
%if notcpuflag(ssse3)
 %if ARCH_X86_64
    SWAP                 m5, m14
 %else
    mova                 m5, [esp+0xA0]
 %endif
%endif
    paddd                m5, d32reg ; pd_32
    paddd                m5, m4
    psrad                m4, m5, 6
    RESTORELINE_W4       m5, 5, 0
    packssdw             m5, m4
    pshufd               m5, m5, q3120
    movu             [tmpq], m5
    lea                srcq, [srcq+strideq*2]
    add                tmpq, 16
    sub                  hd, 2
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    RESTORELINE_W4       m0, 0, 0
    RESTORELINE_W4       m1, 1, 0
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    jg .hv_w4_loop
    RET
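; Summary of the 4-wide H+V path above: the horizontal stage uses the
; 4-tap filter variant (note the +2 byte offset into subpel_filters), with
; the lower/upper halves of subpel_h_shuf4 building the two input windows
; per row pair; the vertical stage is a full 8-tap in 32 bits. As a scalar
; sketch with the rounding constants used here:
;     mid[y][x] = (hsum4(src, y, x) +  2) >> 2; // PMULHRSW_8192 stage
;     tmp[y][x] = (vsum8(mid, y, x) + 32) >> 6; // pd_32, then psrad 6
; SAVELINE_W4/RESTORELINE_W4 park the interleaved "low"/"high" column
; state in stack slots between the two passes of each iteration.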
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
    shr                 mxd, 16
%if ARCH_X86_32
 %define           subpelh0  [rsp+mmsize*5]
 %define           subpelh1  [rsp+mmsize*6]
 %define           subpelv0  [rsp+mmsize*7]
 %define           subpelv1  [rsp+mmsize*8]
 %define           subpelv2  [rsp+mmsize*9]
 %define           subpelv3  [rsp+mmsize*10]
 %define             accuv0  [rsp+mmsize*11]
 %define             accuv1  [rsp+mmsize*12]
    movq                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
    mov                 mxd, myd
    shr                 myd, 16
    and                 mxd, 0x7f
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
    mov             strideq, stridem
 %assign regs_used 6
    ALLOC_STACK  -mmsize*14
 %assign regs_used 7
 %if STACK_ALIGNMENT < mmsize
  %define              tmpm  [rsp+mmsize*13+gprsize*1]
  %define              srcm  [rsp+mmsize*13+gprsize*2]
  %define           stridem  [rsp+mmsize*13+gprsize*3]
    mov                tmpm, tmpq
    mov             stridem, strideq
 %endif
 %if cpuflag(ssse3)
    pshufd               m0, m1, q0000
    pshufd               m1, m1, q1111
 %else
    punpcklbw            m1, m1
    psraw                m1, 8
    pshufd               m0, m1, q1010
    punpckhqdq           m1, m1
 %endif
    punpcklbw            m5, m5
    psraw                m5, 8
    pshufd               m2, m5, q0000
    pshufd               m3, m5, q1111
    pshufd               m4, m5, q2222
    pshufd               m5, m5, q3333
    mova           subpelh0, m0
    mova           subpelh1, m1
    mova           subpelv0, m2
    mova           subpelv1, m3
    mova           subpelv2, m4
    mova           subpelv3, m5
    lea                  r5, [strideq*3+3]
    sub                srcq, r5
    mov                srcm, srcq
%else
    ALLOC_STACK    mmsize*5, 16
 %define           subpelh0  m10
 %define           subpelh1  m11
 %define           subpelv0  m12
 %define           subpelv1  m13
 %define           subpelv2  m14
 %define           subpelv3  m15
 %define             accuv0  m8
 %define             accuv1  m9
    movq                 m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    movq                 m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
 %if cpuflag(ssse3)
    pshufd         subpelh0, m0, q0000
    pshufd         subpelh1, m0, q1111
 %else
    punpcklbw            m0, m0
    psraw                m0, 8
    pshufd         subpelh0, m0, q1010
    pshufd         subpelh1, m0, q3232
    mova                 m7, [base+pw_2]
 %endif
    punpcklbw            m1, m1
    psraw                m1, 8
    pshufd         subpelv0, m1, q0000
    pshufd         subpelv1, m1, q1111
    pshufd         subpelv2, m1, q2222
    pshufd         subpelv3, m1, q3333
    lea            stride3q, [strideq*3]
    sub                srcq, 3
    sub                srcq, stride3q
    mov                  r6, srcq
    mov                  r8, tmpq
%endif
    lea                 r5d, [wq-4]
    shl                 r5d, 14
    add                 r5d, hd
.hv_w8_loop0:
%if cpuflag(ssse3)
 %if ARCH_X86_64
    mova                 m7, [base+subpel_h_shufA]
    mova                 m8, [base+subpel_h_shufB]
    mova                 m9, [base+subpel_h_shufC]
  %define shufA m7
  %define shufB m8
  %define shufC m9
 %else
  %define shufA [base+subpel_h_shufA]
  %define shufB [base+subpel_h_shufB]
  %define shufC [base+subpel_h_shufC]
 %endif
%endif
    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
%if ARCH_X86_64
    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
    add                srcq, stride3q
    PREP_8TAP_HV         m0, srcq+strideq*0, m7, m9
%else
    lea                srcq, [srcq+strideq*2]
 %if notcpuflag(ssse3)
    mova              [esp], m4
 %endif
    PREP_8TAP_HV         m6, srcq+strideq*0, m7, m4
    PREP_8TAP_HV         m0, srcq+strideq*1, m7, m4
    lea                srcq, [srcq+strideq*2]
%endif
%if cpuflag(ssse3)
    mova                 m7, [base+pw_8192]
%else
    mova                 m7, [base+pw_2]
 %if ARCH_X86_32
    mova                 m4, [esp]
 %endif
%endif
    PMULHRSW_8192        m4, m4, m7
    PMULHRSW_8192        m5, m5, m7
    PMULHRSW_8192        m6, m6, m7
    PMULHRSW_8192        m0, m0, m7
    punpcklwd            m1, m4, m5 ; 01
    punpcklwd            m2, m5, m6 ; 12
    punpcklwd            m3, m6, m0 ; 23
    SAVELINE_W8           1, m1
    SAVELINE_W8           2, m2
    SAVELINE_W8           3, m3
%if cpuflag(ssse3)
    mova                 m7, [base+subpel_h_shufA]
%endif
%if ARCH_X86_64
    PREP_8TAP_HV         m4, srcq+strideq*1, m8, m9
    PREP_8TAP_HV         m5, srcq+strideq*2, m8, m9
    add                srcq, stride3q
    PREP_8TAP_HV         m6, srcq+strideq*0, m8, m9
%else
 %if notcpuflag(ssse3)
    mova         [esp+0x30], m0
 %endif
    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
    lea                srcq, [srcq+strideq*2]
    PREP_8TAP_HV         m6, srcq+strideq*0, m7, m0
%endif
%if cpuflag(ssse3)
    mova                 m7, [base+pw_8192]
%elif ARCH_X86_32
    mova                 m0, [esp+0x30]
    mova                 m7, [base+pw_2]
%endif
    PMULHRSW_8192        m1, m4, m7
    PMULHRSW_8192        m2, m5, m7
    PMULHRSW_8192        m3, m6, m7
    punpcklwd            m4, m0, m1 ; 34
    punpcklwd            m5, m1, m2 ; 45
    punpcklwd            m6, m2, m3 ; 56
    SAVELINE_W8           6, m3
    RESTORELINE_W8        1, m1
    RESTORELINE_W8        2, m2
    RESTORELINE_W8        3, m3
.hv_w8_loop:
    SAVELINE_W8           1, m3
    SAVELINE_W8           2, m4
    SAVELINE_W8           3, m5
    SAVELINE_W8           4, m6
%if ARCH_X86_32
    pmaddwd              m0, m1, subpelv0 ; a0
    pmaddwd              m7, m2, subpelv0 ; b0
    pmaddwd              m3, subpelv1     ; a1
    pmaddwd              m4, subpelv1     ; b1
    paddd                m0, m3
    paddd                m7, m4
    pmaddwd              m5, subpelv2     ; a2
    pmaddwd              m6, subpelv2     ; b2
    paddd                m0, m5
    paddd                m7, m6
    mova                 m5, [base+pd_32]
    paddd                m0, m5
    paddd                m7, m5
    mova             accuv0, m0
    mova             accuv1, m7
%else
    pmaddwd          accuv0, m1, subpelv0 ; a0
    pmaddwd          accuv1, m2, subpelv0 ; b0
    pmaddwd              m3, subpelv1     ; a1
    pmaddwd              m4, subpelv1     ; b1
    paddd            accuv0, m3
    paddd            accuv1, m4
    pmaddwd              m5, subpelv2     ; a2
    pmaddwd              m6, subpelv2     ; b2
    paddd            accuv0, m5
    paddd            accuv1, m6
    mova                 m7, [base+pd_32]
    paddd            accuv0, m7
    paddd            accuv1, m7
 %if cpuflag(ssse3)
    mova                 m7, [base+subpel_h_shufB]
    mova                 m6, [base+subpel_h_shufC]
    mova                 m5, [base+subpel_h_shufA]
  %define shufA m5
  %define shufB m7
  %define shufC m6
 %endif
%endif
    PREP_8TAP_HV         m0, srcq+strideq*1, m5, m6
    lea                srcq, [srcq+strideq*2]
    PREP_8TAP_HV         m4, srcq+strideq*0, m5, m6
%if cpuflag(ssse3)
    mova                 m5, [base+pw_8192]
%else
    mova                 m5, [base+pw_2]
%endif
    PMULHRSW_8192        m0, m0, m5
    PMULHRSW_8192        m4, m4, m5
    RESTORELINE_W8        6, m6
    punpcklwd            m5, m6, m0 ; 67
    punpcklwd            m6, m0, m4 ; 78
    pmaddwd              m1, m5, subpelv3 ; a3
    paddd                m2, m1, accuv0
    pmaddwd              m1, m6, subpelv3 ; b3
    paddd                m1, m1, accuv1
    psrad                m2, 6
    psrad                m1, 6
    packssdw             m2, m1
    movq        [tmpq+wq*0], m2
    movhps      [tmpq+wq*2], m2
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jle .hv_w8_outer
    SAVELINE_W8           6, m4
    RESTORELINE_W8        1, m1
    RESTORELINE_W8        2, m2
    RESTORELINE_W8        3, m3
    RESTORELINE_W8        4, m4
    jmp .hv_w8_loop
.hv_w8_outer:
%if ARCH_X86_32
    mov                srcq, srcm
    mov                tmpq, tmpm
    movzx                hd, r5w
    add                srcq, 4
    add                tmpq, 8
    mov                srcm, srcq
    mov                tmpm, tmpq
%else
    add                  r6, 4
    add                  r8, 8
    movzx                hd, r5b
    mov                srcq, r6
    mov                tmpq, r8
%endif
    sub                 r5d, 1<<16
    jg .hv_w8_loop0
    RET
%endmacro
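; End of the shared 8-tap prep template: the body is written once against
; the PMADDUBSW/PHADDW/PMULHRSW_8192/PREP_8TAP_H wrappers and expanded per
; target ISA, with cpuflag(ssse3) choosing native ops or SSE2 replacement
; sequences inside each wrapper. A hypothetical expansion site (macro name
; per this file's conventions; the real instantiations live outside this
; excerpt):
;     INIT_XMM sse2
;     PREP_8TAP
;     INIT_XMM ssse3
;     PREP_8TAP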

%macro movifprep 2
 %if isprep
    mov %1, %2
 %endif
%endmacro

%macro SAVE_REG 1
 %xdefine r%1_save  r%1
 %xdefine r%1q_save r%1q
 %xdefine r%1d_save r%1d
 %if ARCH_X86_32
  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
 %endif
%endmacro

%macro LOAD_REG 1
 %xdefine r%1  r%1_save
 %xdefine r%1q r%1q_save
 %xdefine r%1d r%1d_save
 %if ARCH_X86_32
  %define r%1m r%1m_save
 %endif
 %undef r%1d_save
 %undef r%1q_save
 %undef r%1_save
%endmacro

%macro REMAP_REG 2-3
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
 %if ARCH_X86_32
  %if %3 == 0
   %xdefine r%1m r%2m
  %else
   %define r%1m [rstk+stack_offset+(%1+1)*4]
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %if ARCH_X86_64
   SAVE_REG 14
   %assign %%i 14
   %rep 14
    %assign %%j %%i-1
    REMAP_REG %%i, %%j
    %assign %%i %%i-1
   %endrep
  %else
   SAVE_REG 5
   %assign %%i 5
   %rep 5
    %assign %%j %%i-1
    REMAP_REG %%i, %%j, 0
    %assign %%i %%i-1
   %endrep
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %if ARCH_X86_64
   %rep 13
    %assign %%j %%i+1
    REMAP_REG %%i, %%j
    %assign %%i %%i+1
   %endrep
   LOAD_REG 14
  %else
   %rep 4
    %assign %%j %%i+1
    REMAP_REG %%i, %%j, 1
    %assign %%i %%i+1
   %endrep
   LOAD_REG 5
  %endif
 %endif
%endmacro

%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro
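; Register-remapping helpers: prep_8tap_scaled takes two fewer arguments
; than put_8tap_scaled (no dst/ds), so when the shared body runs as prep,
; every GPR alias shifts down by one. REMAP_REG rebinds the r%n names (and
; their stack-argument forms on x86-32), SAVE_REG/LOAD_REG preserve the
; alias displaced off the end, and MC_8TAP_SCALED_RET restores the default
; mapping around RET so the epilogue sees the real registers.
; Schematically, on x86-64:
;     REMAP_REGS_TO_PREV:    name r14 -> reg r13, ..., name r1 -> reg r0
;     REMAP_REGS_TO_DEFAULT: the inverse, then LOAD_REG 14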

%if ARCH_X86_64
 %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
    SWAP                m%2, m%5
    movq                m%1, [srcq+ r4]
    movq                m%2, [srcq+ r6]
    movhps              m%1, [srcq+ r7]
    movhps              m%2, [srcq+ r9]
    movq                m%3, [srcq+r10]
    movq                m%4, [srcq+r11]
    movhps              m%3, [srcq+r13]
    movhps              m%4, [srcq+ rX]
    add                srcq, ssq
    movq                m%5, [srcq+ r4]
    movq                m%6, [srcq+ r6]
    movhps              m%5, [srcq+ r7]
    movhps              m%6, [srcq+ r9]
    movq                m%7, [srcq+r10]
    movq                m%8, [srcq+r11]
    movhps              m%7, [srcq+r13]
    movhps              m%8, [srcq+ rX]
    add                srcq, ssq
    pmaddubsw           m%1, m%9
    pmaddubsw           m%5, m%9
    pmaddubsw           m%2, m%10
    pmaddubsw           m%6, m%10
    pmaddubsw           m%3, m%11
    pmaddubsw           m%7, m%11
    pmaddubsw           m%4, m%12
    pmaddubsw           m%8, m%12
    phaddw              m%1, m%2
    phaddw              m%5, m%6
    phaddw              m%3, m%4
    phaddw              m%7, m%8
    phaddw              m%1, m%3
    phaddw              m%5, m%7
    pmulhrsw            m%1, m12
    pmulhrsw            m%5, m12
    SWAP                m%2, m%5
 %endmacro
%else
 %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
  %if %3 == 1
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
  %endif
    movq                 m0, [srcq+r0]
    movq                 m1, [srcq+rX]
    movhps               m0, [srcq+r4]
    movhps               m1, [srcq+r5]
    add                srcq, ssq
    movq                 m4, [srcq+r0]
    movq                 m5, [srcq+rX]
    movhps               m4, [srcq+r4]
    movhps               m5, [srcq+r5]
    mov                  r0, [esp+16]
    mov                  rX, [esp+24]
    mov                  r4, [esp+20]
    mov                  r5, [esp+28]
    sub                srcq, ssq
    movq                 m2, [srcq+r0]
    movq                 m3, [srcq+rX]
    movhps               m2, [srcq+r4]
    movhps               m3, [srcq+r5]
    add                srcq, ssq
    movq                 m6, [srcq+r0]
    movq                 m7, [srcq+rX]
    movhps               m6, [srcq+r4]
    movhps               m7, [srcq+r5]
    add                srcq, ssq
    pmaddubsw            m0, [esp+%1+ 0]
    pmaddubsw            m4, [esp+%1+ 0]
    pmaddubsw            m1, [esp+%1+16]
    pmaddubsw            m5, [esp+%1+16]
    pmaddubsw            m2, [esp+%1+32]
    pmaddubsw            m6, [esp+%1+32]
    pmaddubsw            m3, [esp+%1+48]
    pmaddubsw            m7, [esp+%1+48]
    phaddw               m0, m1
    phaddw               m4, m5
    phaddw               m2, m3
    phaddw               m6, m7
    phaddw               m0, m2
    phaddw               m4, m6
    pmulhrsw             m0, m12
    pmulhrsw             m4, m12
  %if %2 != 0
    mova        [esp+%2+ 0], m0
    mova        [esp+%2+16], m4
  %endif
 %endmacro
%endif
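; MC_8TAP_SCALED_H gathers one row of eight horizontally-scaled pixels:
; the per-pixel column offsets (r4/r6/r7/r9/r10/r11/r13/rX on x86-64,
; stack slots on x86-32) are precomputed by the caller, since with scaling
; every output pixel has its own source x and filter phase. Roughly:
;     for (int i = 0; i < 8; i++) {
;         int x = (mx + dx * i) >> 10;   // 10-bit fractional step
;         d[i] = filter8(&src[x], f[i]); // per-pixel 8-tap window
;     }
; The phaddw tree then folds the eight partial sums back into one
; register, and pmulhrsw with m12 (pw_8192 in the functions below) applies
; the usual >>2 rounding.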

%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isprep 0
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
  %else
cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
  %endif
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
  %else
cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
  %endif
 %endif
 %xdefine base_reg r12
 %define rndshift 10
%else ; prep
 %assign isprep 1
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   %xdefine tmp_stridem r14q
  %else
cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   %define tmp_stridem qword [rsp+0x138]
  %endif
  %xdefine base_reg r11
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
  %else
cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
  %endif
  %define tmp_stridem dword [esp+0x138]
 %endif
 %define rndshift 6
%endif
%if ARCH_X86_32
    mov         [esp+0x1f0], t0d
    mov         [esp+0x1f4], t1d
 %if !isprep && required_stack_alignment > STACK_ALIGNMENT
    mov                dstd, dstm
    mov                 dsd, dsm
    mov                srcd, srcm
    mov                 ssd, ssm
    mov                  hd, hm
    mov                  r4, mxm
  %define r0m  [esp+0x200]
  %define dsm  [esp+0x204]
  %define dsmp dsm
  %define r1m  dsm
  %define r2m  [esp+0x208]
  %define ssm  [esp+0x20c]
  %define r3m  ssm
  %define hm   [esp+0x210]
  %define mxm  [esp+0x214]
    mov                 r0m, dstd
    mov                 dsm, dsd
    mov                 r2m, srcd
    mov                 ssm, ssd
    mov                  hm, hd
    mov                  r0, mym
    mov                  r1, dxm
    mov                  r2, dym
  %define mym [esp+0x218]
  %define dxm [esp+0x09c]
  %define dym [esp+0x21c]
    mov                 mxm, r4
    mov                 mym, r0
    mov                 dxm, r1
    mov                 dym, r2
    tzcnt                wd, wm
 %endif
 %if isprep && required_stack_alignment > STACK_ALIGNMENT
  %xdefine base_reg r5
 %else
  %xdefine base_reg r6
 %endif
    mov                 ssd, ssm
%endif
    LEA            base_reg, %1_8tap_scaled_8bpc_ssse3
%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
    tzcnt                wd, wm
%endif
%if ARCH_X86_32
 %define m8  m0
 %define m9  m1
 %define m14 m4
 %define m15 m3
%endif
    movd                 m8, dxm
    movd                m14, mxm
    pshufd               m8, m8, q0000
    pshufd              m14, m14, q0000
%if isprep && UNIX64
    mov                 r5d, t0d
 DECLARE_REG_TMP 5, 7
%endif
%if ARCH_X86_64
    mov                 dyd, dym
%endif
%ifidn %1, put
 %if WIN64
    mov                 r8d, hm
  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
  %define hm r5m
  %define dxm r8m
 %elif ARCH_X86_64
  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
  %define hm r6m
 %endif
 %if ARCH_X86_64
  %if required_stack_alignment > STACK_ALIGNMENT
   %define dsm [rsp+0x138]
   %define rX r1
   %define rXd r1d
  %else
   %define dsm dsq
   %define rX r14
   %define rXd r14d
  %endif
 %else
  %define rX r1
 %endif
%else ; prep
 %if WIN64
    mov                 r7d, hm
  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
  %define hm r4m
  %define dxm r7m
 %elif ARCH_X86_64
  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
  %define hm [rsp+0x94]
 %endif
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %if ARCH_X86_64
  %define rX r14
  %define rXd r14d
 %else
  %define rX r3
 %endif
%endif
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
    mova                m12, [base+pw_8192]
 %ifidn %1, put
    mova                m13, [base+pd_512]
 %else
    mova                m13, [base+pd_32]
 %endif
%else
 %define m10 [base+pd_0x3ff]
 %define m12 [base+pw_8192]
 %ifidn %1, put
  %define m13 [base+pd_512]
 %else
  %define m13 [base+pd_32]
 %endif
%endif
    pxor                 m9, m9
%if ARCH_X86_64
    lea                ss3q, [ssq*3]
    movzx               r7d, t1b
    shr                 t1d, 16
    cmp                  hd, 6
    cmovs               t1d, r7d
    sub                srcq, ss3q
%else
 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    mov                  r1, [esp+0x1f4]
    lea                  r0, [ssq*3]
    movzx                r2, r1b
    shr                  r1, 16
    cmp            dword hm, 6
    cmovs                r1, r2
    mov         [esp+0x1f4], r1
    mov                  r1, r1m
    mov                  r2, r2m
    sub                srcq, r0
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define ss3q r0
 %define myd r4
 %define dyd dword dym
 %define hd  dword hm
%endif
    cmp                 dyd, 1024
    je .dy1
    cmp                 dyd, 2048
    je .dy2
    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
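; Dispatch: dy == 1024 (a vertical step of exactly 1.0) and dy == 2048
; (exactly 2.0) branch to the specialised .dy1/.dy2 bodies, where the
; vertical filter phase is constant per row; everything else goes through
; the per-width jump table indexed by tzcnt(w). In outline:
;     if      (dy == 1024) goto dy1;  // fixed phase, 1 new row per row
;     else if (dy == 2048) goto dy2;  // fixed phase, 2 new rows per row
;     else                 goto wtab[ctz(w)];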
%ifidn %1, put
.w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
 %else
    movzx                r4, byte [esp+0x1f0]
    dec                srcq
    movd                m15, r4
 %endif
    punpckldq            m9, m8
    SWAP                 m8, m9
    paddd               m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
 %else
  %define m11 [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    psrldq              m15, 4
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_dw]
    mova                 m6, [base+subpel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m9, m9
    pcmpeqd              m8, m9
    psrld               m14, 10
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova        [rsp+0x180], m14
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq                 m0, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*2]
    movhps               m0, [srcq+ssq*1]
    movhps               m2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    pshufb              m14, m5
    paddb               m14, m6
 %endif
    movq                 m1, [srcq+ssq*0]
    movq                 m3, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*1]
    movhps               m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpckldq           m15, m7
    punpcklqdq          m15, m15
 %if ARCH_X86_64
    pand                m11, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
 %else
    pand                 m7, m8, m11
    pandn                m8, m15
  %define m8  m6
  %define m15 m5
    por                 m15, m7
    mova        [rsp+0x190], m15
 %endif
    pshufb               m0, m14
    pshufb               m2, m14
    pshufb               m1, m14
    pshufb               m3, m14
    pmaddubsw            m0, m15
    pmaddubsw            m2, m15
    pmaddubsw            m1, m15
    pmaddubsw            m3, m15
    phaddw               m0, m2
    phaddw               m1, m3
    pmulhrsw             m0, m12       ; 0 1 2 3
    pmulhrsw             m1, m12       ; 4 5 6 7
    palignr              m2, m1, m0, 4 ; 1 2 3 4
    punpcklwd            m3, m0, m2    ; 01 12
    punpckhwd            m0, m2        ; 23 34
    pshufd               m5, m1, q0321 ; 5 6 7 _
    punpcklwd            m2, m1, m5    ; 45 56
    punpckhwd            m4, m1, m5    ; 67 __
 %if ARCH_X86_32
    mov                 myd, mym
    mov                  r0, r0m
    mova        [rsp+0x1a0], m3
    mova        [rsp+0x1b0], m0
    mova        [rsp+0x1c0], m2
    mova        [rsp+0x1d0], m4
 %endif
.w2_loop:
    and                 myd, 0x3ff
 %if ARCH_X86_64
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq                m11, r6q
    punpcklbw           m11, m11
    psraw               m11, 8
    pshufd               m8, m11, q0000
    pshufd               m9, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
    pmaddwd              m5, m3, m8
    pmaddwd              m6, m0, m9
    pmaddwd              m7, m2, m10
    pmaddwd              m8, m4, m11
    paddd                m5, m6
    paddd                m7, m8
 %else
    mov                 mym, myd
    mov                  r1, [esp+0x1f4]
    xor                  r3, r3
    shr                  r4, 6
    lea                  r1, [r1+r4]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r1*8+0]
    cmovnz               r3, [base+subpel_filters+r1*8+4]
    movd                 m7, r4
    movd                 m6, r3
    punpckldq            m7, m6
    punpcklbw            m7, m7
    psraw                m7, 8
    pshufd               m5, m7, q0000
    pshufd               m6, m7, q1111
    pmaddwd              m3, m5
    pmaddwd              m0, m6
    pshufd               m5, m7, q2222
    pshufd               m7, m7, q3333
    pmaddwd              m2, m5
    pmaddwd              m4, m7
    paddd                m3, m0
    paddd                m2, m4
    SWAP                 m5, m3
    SWAP                 m7, m2
 %endif
    paddd                m5, m13
    paddd                m5, m7
    psrad                m5, 10
    packssdw             m5, m5
    packuswb             m5, m5
 %if ARCH_X86_64
    pextrw              r6d, m5, 0
    mov              [dstq], r6w
    add                dstq, dsq
    dec                  hd
    jz .ret
    add                 myd, dyd
 %else
    pextrw              r3d, m5, 0
    mov              [dstq], r3w
    add                dstq, dsm
    dec                  hd
    jz .ret
    mov                 myd, mym
    add                 myd, dym
 %endif
    test                myd, ~0x3ff
 %if ARCH_X86_32
    SWAP                 m3, m5
    SWAP                 m2, m7
    mova                 m3, [rsp+0x1a0]
    mova                 m0, [rsp+0x1b0]
    mova                 m2, [rsp+0x1c0]
    mova                 m4, [rsp+0x1d0]
  %define m14 [esp+0x180]
  %define m15 [esp+0x190]
 %endif
    jz .w2_loop
 %if ARCH_X86_32
    mov                  r3, r3m
 %endif
    movq                 m5, [srcq]
    test                myd, 0x400
    jz .w2_skip_line
    add                srcq, ssq
    shufps               m3, m0, q1032      ; 01 12
    shufps               m0, m2, q1032      ; 23 34
    shufps               m2, m4, q1032      ; 45 56
    pshufb               m5, m14
    pmaddubsw            m5, m15
    phaddw               m5, m5
    pmulhrsw             m5, m12
    palignr              m4, m5, m1, 12
    punpcklqdq           m1, m4, m4         ; 6 7 6 7
    punpcklwd            m4, m1, m5         ; 67 __
 %if ARCH_X86_32
    mova        [rsp+0x1a0], m3
    mova        [rsp+0x1b0], m0
    mova        [rsp+0x1c0], m2
    mova        [rsp+0x1d0], m4
 %endif
    jmp .w2_loop
.w2_skip_line:
    movhps               m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m3, m0             ; 01 12
    mova                 m0, m2             ; 23 34
    pshufb               m5, m14
    pmaddubsw            m5, m15
    phaddw               m5, m5
    pmulhrsw             m5, m12            ; 6 7 6 7
    palignr              m4, m5, m1, 8      ; 4 5 6 7
    pshufd               m5, m4, q0321      ; 5 6 7 _
    mova                 m1, m4
    punpcklwd            m2, m4, m5         ; 45 56
    punpckhwd            m4, m5             ; 67 __
 %if ARCH_X86_32
    mova        [rsp+0x1a0], m3
    mova        [rsp+0x1b0], m0
    mova        [rsp+0x1c0], m2
    mova        [rsp+0x1d0], m4
 %endif
    jmp .w2_loop
%endif
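; The .w2 loop above resamples vertically by accumulating dy into my: the
; low 10 bits of my are the filter phase for the current output row, and
; 'test myd, ~0x3ff' asks whether the integer part advanced at all. When
; it did, bit 0x400 separates consuming one new source row (shufps
; rotation by one row) from two (.w2_skip_line, rotation by a row pair).
; Equivalent bookkeeping, as a sketch:
;     my += dy;
;     int adv = my >> 10;  // 0, 1 or 2 for the dy values reaching here
;     my &= 0x3ff;         // phase for the next filter lookup
; The 01/12..67 row-pair window is then shifted by adv rows before the
; next set of pmaddwd taps.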
4544INIT_XMM ssse3
4545.w4:
4546%if ARCH_X86_64
4547    mov                 myd, mym
4548    movzx               t0d, t0b
4549    dec                srcq
4550    movd                m15, t0d
4551%else
4552 %define m8  m0
4553 %xdefine m14 m4
4554 %define m15 m3
4555    movzx                r4, byte [esp+0x1f0]
4556    dec                srcq
4557    movd                m15, r4
4558%endif
4559    pmaddwd              m8, [base+rescale_mul]
4560%if ARCH_X86_64
4561    mova                m11, [base+pd_0x4000]
4562%else
4563  %define m11 [base+pd_0x4000]
4564%endif
4565    pshufd              m15, m15, q0000
4566    paddd               m14, m8 ; mx+dx*[0-3]
4567    pand                 m0, m14, m10
4568    psrld                m0, 6
4569    paddd               m15, m0
4570    psrldq               m7, m15, 8
4571%if ARCH_X86_64
4572    movd                r4d, m15
4573    movd               r11d, m7
4574    psrldq              m15, 4
4575    psrldq               m7, 4
4576    movd                r6d, m15
4577    movd               r13d, m7
4578    movd                m15, [base+subpel_filters+ r4*8+2]
4579    movd                 m2, [base+subpel_filters+r11*8+2]
4580    movd                 m3, [base+subpel_filters+ r6*8+2]
4581    movd                 m4, [base+subpel_filters+r13*8+2]
4582%else
4583    movd                 r0, m15
4584    movd                 rX, m7
4585    psrldq              m15, 4
4586    psrldq               m7, 4
4587    movd                 r4, m15
4588    movd                 r5, m7
4589    movd                 m1, [base+subpel_filters+r0*8+2]
4590    movd                 m2, [base+subpel_filters+rX*8+2]
4591    movd                 m3, [base+subpel_filters+r4*8+2]
4592    movd                 m7, [base+subpel_filters+r5*8+2]
4593    movifprep            r3, r3m
4594    SWAP                 m4, m7
4595 %define m15 m1
4596%endif
4597    mova                 m5, [base+bdct_lb_dw]
4598    movq                 m6, [base+subpel_s_shuf2]
4599    psrld               m14, 10
4600    punpckldq           m15, m3
4601    punpckldq            m2, m4
4602    punpcklqdq          m15, m2
4603    punpcklqdq           m6, m6
4604    pshufb              m14, m5
4605    paddb               m14, m6
4606%if ARCH_X86_64
4607    pcmpeqd              m0, m9
4608    pand                m11, m0
4609%else
4610    mova        [esp+0x180], m14
4611    SWAP                 m7, m4
4612    pxor                 m3, m3
4613    pcmpeqd              m0, m3
4614    pand                 m2, m11, m0
4615 %define m11 m2
4616%endif
4617    pandn                m0, m15
4618%if ARCH_X86_64
4619    SWAP                m15, m0
4620%else
4621 %define m15 m0
4622%endif
4623    por                 m15, m11
%if ARCH_X86_64
    movu                 m7, [srcq+ssq*0]
    movu                 m9, [srcq+ssq*1]
    movu                 m8, [srcq+ssq*2]
    movu                m10, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    movu                 m2, [srcq+ssq*0]
    movu                 m4, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    movu                 m5, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb               m7, m14
    pshufb               m9, m14
    pshufb               m8, m14
    pshufb              m10, m14
    pshufb               m2, m14
    pshufb               m4, m14
    pshufb               m3, m14
    pshufb               m5, m14
    pmaddubsw            m7, m15
    pmaddubsw            m9, m15
    pmaddubsw            m8, m15
    pmaddubsw           m10, m15
    pmaddubsw            m2, m15
    pmaddubsw            m4, m15
    pmaddubsw            m3, m15
    pmaddubsw            m5, m15
    phaddw               m7, m9
    phaddw               m8, m10
    phaddw               m9, m2, m4
    phaddw               m3, m5
    pmulhrsw             m7, m12            ; 0 1
    pmulhrsw             m8, m12            ; 2 3
    pmulhrsw             m9, m12            ; 4 5
    pmulhrsw             m3, m12            ; 6 7
    shufps               m4, m7, m8, q1032  ; 1 2
    shufps               m5, m8, m9, q1032  ; 3 4
    shufps               m6, m9, m3, q1032  ; 5 6
    psrldq              m11, m3, 8          ; 7 _
    punpcklwd            m0, m7, m4 ; 01
    punpckhwd            m7, m4     ; 12
    punpcklwd            m1, m8, m5 ; 23
    punpckhwd            m8, m5     ; 34
    punpcklwd            m2, m9, m6 ; 45
    punpckhwd            m9, m6     ; 56
    punpcklwd            m3, m11    ; 67
    mova         [rsp+0x00], m7
    mova         [rsp+0x10], m8
    mova         [rsp+0x20], m9
%else
    mova        [esp+0x190], m15
    lea                ss3q, [ssq*3]
    movu                 m2, [srcq+ssq*0]
    movu                 m3, [srcq+ssq*1]
    movu                 m7, [srcq+ssq*2]
    movu                 m6, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m7, m14
    pshufb               m6, m14
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    pmaddubsw            m7, m15
    pmaddubsw            m6, m15
    phaddw               m2, m3
    phaddw               m7, m6
    movu                 m1, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    movu                 m6, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb               m1, m14
    pshufb               m5, m14
    pshufb               m3, m14
    pshufb               m6, m14
    pmaddubsw            m1, m15
    pmaddubsw            m5, m15
    pmaddubsw            m3, m15
    pmaddubsw            m6, m15
    phaddw               m1, m5
    phaddw               m3, m6
    pmulhrsw             m2, m12
    pmulhrsw             m7, m12
    pmulhrsw             m1, m12
    pmulhrsw             m3, m12
    shufps               m4, m2, m7, q1032  ; 1 2
    shufps               m5, m7, m1, q1032  ; 3 4
    shufps               m6, m1, m3, q1032  ; 5 6
    psrldq               m0, m3, 8          ; 7 _
    mova        [esp+0x1a0], m0
 %define m11 [esp+0x1a0]
    punpcklwd            m0, m2, m4      ; 01
    punpckhwd            m2, m4          ; 12
    punpcklwd            m4, m7, m5      ; 23
    punpckhwd            m7, m5          ; 34
    punpcklwd            m5, m1, m6      ; 45
    punpckhwd            m1, m6          ; 56
    punpcklwd            m3, [esp+0x1a0] ; 67
    mov                 myd, mym
    mov                  r0, r0m
    mova        [esp+0x1b0], m0 ; 01
    mova        [esp+0x1c0], m4 ; 23
    mova        [esp+0x1d0], m5 ; 45
    mova        [esp+0x1e0], m3 ; 67
    mova         [rsp+0x00], m2 ; 12
    mova         [rsp+0x10], m7 ; 34
    mova         [rsp+0x20], m1 ; 56
    SWAP                 m1, m4
    SWAP                 m2, m5
%endif
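; w4 vertical loop: each output row fetches a fresh 8-tap vertical filter
; for my>>6 (the 64<<24 default is a unit filter when the index is 0) and
; accumulates the 01/23/45/67 row pairs against its coefficient pairs.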
.w4_loop:
    and                 myd, 0x3ff
%if ARCH_X86_64
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq                m10, r6q
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
    pmaddwd              m4, m0, m7
    pmaddwd              m5, m1, m8
    pmaddwd              m6, m2, m9
    pmaddwd              m7, m3, m10
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
%else
    mov                 mym, myd
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                  r4, 6
    lea                  r5, [r5+r4]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    movd                 m7, r4
    movd                 m6, r3
    punpckldq            m7, m6
    punpcklbw            m7, m7
    psraw                m7, 8
    pshufd               m4, m7, q0000
    pshufd               m5, m7, q1111
    pshufd               m6, m7, q2222
    pshufd               m7, m7, q3333
    pmaddwd              m0, m4
    pmaddwd              m1, m5
    pmaddwd              m2, m6
    pmaddwd              m3, m7
    paddd                m0, m1
    paddd                m2, m3
    paddd                m0, m13
    paddd                m0, m2
    SWAP                 m4, m0
%endif
    psrad                m4, rndshift
    packssdw             m4, m4
%ifidn %1, put
    packuswb             m4, m4
    movd             [dstq], m4
    add                dstq, dsmp
%else
    movq             [tmpq], m4
    add                tmpq, 8
%endif
    dec                  hd
    jz .ret
%if ARCH_X86_64
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .w4_loop
%else
    SWAP                 m0, m4
    mov                 myd, mym
    mov                  r3, r3m
    add                 myd, dym
    test                myd, ~0x3ff
    jnz .w4_next_line
    mova                 m0, [esp+0x1b0]
    mova                 m1, [esp+0x1c0]
    mova                 m2, [esp+0x1d0]
    mova                 m3, [esp+0x1e0]
    jmp .w4_loop
.w4_next_line:
  %define m14 [esp+0x180]
  %define m15 [esp+0x190]
%endif
    movu                 m4, [srcq]
    test                myd, 0x400
    jz .w4_skip_line
%if ARCH_X86_64
    mova                 m0, [rsp+0x00]
    mova         [rsp+0x00], m1
    mova                 m1, [rsp+0x10]
    mova         [rsp+0x10], m2
    mova                 m2, [rsp+0x20]
    mova         [rsp+0x20], m3
%else
    mova                 m5, [esp+0x1c0]
    mova                 m0, [rsp+0x000]
    mova         [rsp+0x00], m5
    mova        [esp+0x1b0], m0
    mova                 m6, [esp+0x1d0]
    mova                 m1, [rsp+0x010]
    mova         [rsp+0x10], m6
    mova        [esp+0x1c0], m1
    mova                 m7, [esp+0x1e0]
    mova                 m2, [rsp+0x020]
    mova         [rsp+0x20], m7
    mova        [esp+0x1d0], m2
%endif
    pshufb               m4, m14
    pmaddubsw            m4, m15
    phaddw               m4, m4
    pmulhrsw             m4, m12
    punpcklwd            m3, m11, m4
%if ARCH_X86_32
    mova        [esp+0x1e0], m3
%endif
    mova                m11, m4
    add                srcq, ssq
    jmp .w4_loop
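; Advance two source rows at once: filter both new rows and shift the
; row-pair history by two.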
.w4_skip_line:
%if ARCH_X86_32
    mova                 m0, [esp+0x1c0]
    mova                 m1, [esp+0x1d0]
    mova                 m2, [esp+0x1e0]
%endif
    movu                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m6, [rsp+0x10]
    mova                 m7, [rsp+0x20]
    pshufb               m4, m14
    pshufb               m5, m14
    pmaddubsw            m4, m15
    pmaddubsw            m5, m15
    phaddw               m4, m5
    pmulhrsw             m4, m12
    punpcklwd            m5, m11, m4
    mova         [rsp+0x00], m6
    mova         [rsp+0x10], m7
    mova         [rsp+0x20], m5
%if ARCH_X86_64
    psrldq              m11, m4, 8
    mova                 m0, m1
    mova                 m1, m2
    mova                 m2, m3
    punpcklwd            m3, m4, m11
%else
    psrldq               m6, m4, 8
    punpcklwd            m3, m4, m6
    mova        [esp+0x1a0], m6
    mova        [esp+0x1b0], m0
    mova        [esp+0x1c0], m1
    mova        [esp+0x1d0], m2
    mova        [esp+0x1e0], m3
%endif
    jmp .w4_loop
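; Wider blocks are processed in 8-pixel column strips: [rsp+0x90] holds the
; strip count, tmp_stridem the prep tmp row stride in bytes.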
INIT_XMM ssse3
.w8:
    mov    dword [rsp+0x90], 1
    movifprep   tmp_stridem, 16
    jmp .w_start
.w16:
    mov    dword [rsp+0x90], 2
    movifprep   tmp_stridem, 32
    jmp .w_start
.w32:
    mov    dword [rsp+0x90], 4
    movifprep   tmp_stridem, 64
    jmp .w_start
.w64:
    mov    dword [rsp+0x90], 8
    movifprep   tmp_stridem, 128
    jmp .w_start
.w128:
    mov    dword [rsp+0x90], 16
    movifprep   tmp_stridem, 256
.w_start:
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
%if ARCH_X86_64
    shr                 t0d, 16
    movd                m15, t0d
%else
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov                  r4, [esp+0x1f0]
    shr                  r4, 16
    movd                m15, r4
    mov                  r0, r0m
    mov                 myd, mym
%endif
    sub                srcq, 3
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    mova        [rsp+0x100], m7
    mova        [rsp+0x120], m15
    mov         [rsp+0x098], srcq
    mov         [rsp+0x130], r0q ; dstq / tmpq
%if ARCH_X86_64 && UNIX64
    mov                  hm, hd
%elif ARCH_X86_32
    mov                  r5, hm
    mov         [esp+0x094], myd
    mov         [esp+0x134], r5
%endif
    jmp .hloop
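; Move to the next 8-pixel strip: restore the saved src pointer and dx
; state, step the dst/tmp pointer, then redo the horizontal setup.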
.hloop_prep:
    dec   dword [rsp+0x090]
    jz .ret
%if ARCH_X86_64
    add   qword [rsp+0x130], 8*(isprep+1)
    mov                  hd, hm
%else
    add   dword [esp+0x130], 8*(isprep+1)
    mov                 myd, [esp+0x094]
    mov                  r5, [esp+0x134]
    mov                  r0, [esp+0x130]
%endif
    mova                 m7, [rsp+0x100]
    mova                m14, [rsp+0x110]
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
%endif
    mova                m15, [rsp+0x120]
    pxor                 m9, m9
    mov                srcq, [rsp+0x098]
%if ARCH_X86_64
    mov                 r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov                 mym, myd
    mov                  hm, r5
    mov                 r0m, r0
    mov                  r3, r3m
%endif
    paddd               m14, m7
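; Per-strip horizontal setup: fetch eight per-pixel 8-tap filters for
; mx+dx*[0-7]; lanes with filter index 0 take pq_0x40000000 instead, which
; acts as a unit filter (a single tap of 64).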
.hloop:
%if ARCH_X86_64
    mova                m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld                m2, m14, 10
    mova              [rsp], m2
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd                m5, m15, m6
    pcmpeqd              m6, m9
    psrldq               m2, m5, 8
%if ARCH_X86_64
    movd                r4d, m5
    movd                r6d, m2
    psrldq               m5, 4
    psrldq               m2, 4
    movd                r7d, m5
    movd                r9d, m2
    movq                 m0, [base+subpel_filters+r4*8]
    movq                 m1, [base+subpel_filters+r6*8]
    movhps               m0, [base+subpel_filters+r7*8]
    movhps               m1, [base+subpel_filters+r9*8]
%else
    movd                 r0, m5
    movd                 rX, m2
    psrldq               m5, 4
    psrldq               m2, 4
    movd                 r4, m5
    movd                 r5, m2
    movq                 m0, [base+subpel_filters+r0*8]
    movq                 m1, [base+subpel_filters+rX*8]
    movhps               m0, [base+subpel_filters+r4*8]
    movhps               m1, [base+subpel_filters+r5*8]
    pxor                 m2, m2
 %define m9 m2
%endif
    paddd               m14, m7 ; mx+dx*[4-7]
    pand                 m5, m14, m10
    psrld                m5, 6
    paddd               m15, m5
    pcmpeqd              m5, m9
    mova        [rsp+0x110], m14
    psrldq               m4, m15, 8
%if ARCH_X86_64
    movd               r10d, m15
    movd               r11d, m4
    psrldq              m15, 4
    psrldq               m4, 4
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    psrldq               m4, m14, 8
    movd               r10d, m14
    movd               r11d, m4
    psrldq              m14, 4
    psrldq               m4, 4
    movd               r13d, m14
    movd                rXd, m4
    mov                 r4d, [rsp+ 0]
    mov                 r6d, [rsp+ 8]
    mov                 r7d, [rsp+ 4]
    mov                 r9d, [rsp+12]
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd              m14, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m7, m11, m4
    pand                 m8, m11, m6
    pand                m15, m11, m14
    pand                m11, m11, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn               m14, m2
    pandn                m5, m3
    por                  m7, m4
    por                  m8, m6
    por                 m15, m14
    por                 m11, m5
    mova         [rsp+0x10], m7
    mova         [rsp+0x20], m8
    mova         [rsp+0x30], m15
    mova         [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
    mova         [rsp+0x50], m1
    mova         [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
    mova         [rsp+0x70], m3
    mova         [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
    SWAP                 m7, m0
    SWAP                 m8, m14
    mova                 m1, [rsp+0x50]
    mova                 m2, [rsp+0x60]
    mova                 m3, [rsp+0x70]
    mova                 m9, [rsp+0x80]
    mov                 myd, mym
    mov                 dyd, dym
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m9 ; 23a
    punpckhwd            m3, m9     ; 23b
    mova         [rsp+0x50], m4
    mova         [rsp+0x60], m5
    mova         [rsp+0x70], m6
    mova         [rsp+0x80], m7
    SWAP                m14, m8
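; Vertical loop (x86-64): reload the vertical filter from my each row and
; apply it to row pairs 01/23 in registers and 45/67 on the stack; the a/b
; suffixes denote the low/high four pixels of the strip.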
.vloop:
    and                 myd, 0x3ff
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq                m11, r6q
    punpcklbw           m11, m11
    psraw               m11, 8
    pshufd               m5, m11, q0000
    pshufd               m7, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
    pmaddwd              m4, m5, m0
    pmaddwd              m5, m5, m1
    pmaddwd              m6, m7, m2
    pmaddwd              m7, m7, m3
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
    pmaddwd              m6, [rsp+0x50], m10
    pmaddwd              m7, [rsp+0x60], m10
    pmaddwd              m8, [rsp+0x70], m11
    pmaddwd              m9, [rsp+0x80], m11
    paddd                m4, m6
    paddd                m5, m7
    paddd                m4, m8
    paddd                m5, m9
%else
    movd                 r0, m15
    movd                 rX, m4
    psrldq              m15, 4
    psrldq               m4, 4
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [esp+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10
    mova           [esp+16], m14
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    mova         [esp+0x20], m0
    mova         [esp+0x30], m1
    mova         [esp+0x40], m2
    mova         [esp+0x50], m3
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m11, m4
    pand                 m1, m11, m6
    pand                 m2, m11, m7
    pand                 m3, m11, m5
    pandn                m4, [esp+0x20]
    pandn                m6, [esp+0x30]
    pandn                m7, [esp+0x40]
    pandn                m5, [esp+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    mova         [esp+0x20], m0
    mova         [esp+0x30], m1
    mova         [esp+0x40], m2
    mova         [esp+0x50], m3
    MC_8TAP_SCALED_H   0x20, 0x140, 0 ; 0-1
    MC_8TAP_SCALED_H   0x20, 0x160    ; 2-3
    MC_8TAP_SCALED_H   0x20, 0x180    ; 4-5
    MC_8TAP_SCALED_H   0x20, 0x1a0    ; 6-7
    mova                 m5, [esp+0x180]
    mova                 m6, [esp+0x190]
    mova                 m7, [esp+0x1a0]
    mova                 m0, [esp+0x1b0]
    mov                 myd, mym
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova        [esp+0x180], m4
    mova        [esp+0x190], m5
    mova        [esp+0x1a0], m6
    mova        [esp+0x1b0], m7
    mova                 m1, [esp+0x140]
    mova                 m2, [esp+0x150]
    mova                 m3, [esp+0x160]
    mova                 m4, [esp+0x170]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova        [esp+0x140], m0
    mova        [esp+0x150], m1
    mova        [esp+0x160], m2
    mova        [esp+0x170], m3
.vloop:
    mov                  r0, r0m
    mov                  r5, [esp+0x1f4]
    and                 myd, 0x3ff
    mov                 mym, myd
    xor                  r3, r3
    shr                  r4, 6
    lea                  r5, [r5+r4]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    movd                 m7, r4
    movd                 m6, r3
    punpckldq            m7, m6
    punpcklbw            m7, m7
    psraw                m7, 8
    pshufd               m4, m7, q0000
    pshufd               m5, m7, q1111
    pmaddwd              m0, m4
    pmaddwd              m1, m4
    pmaddwd              m2, m5
    pmaddwd              m3, m5
    pshufd               m6, m7, q2222
    pshufd               m7, m7, q3333
    paddd                m0, m2
    paddd                m1, m3
    pmaddwd              m2, [esp+0x180], m6
    pmaddwd              m3, [esp+0x190], m6
    pmaddwd              m4, [esp+0x1a0], m7
    pmaddwd              m5, [esp+0x1b0], m7
    paddd                m0, m2
    paddd                m1, m3
    paddd                m0, m13
    paddd                m1, m13
    paddd                m4, m0
    paddd                m5, m1
%endif
    psrad                m4, rndshift
    psrad                m5, rndshift
    packssdw             m4, m5
%ifidn %1, put
    packuswb             m4, m4
    movq             [dstq], m4
    add                dstq, dsm
%else
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .hloop_prep
%if ARCH_X86_64
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .vloop
    test                myd, 0x400
    mov         [rsp+0x140], myd
    mov                 r4d, [rsp+ 0]
    mov                 r6d, [rsp+ 8]
    mov                 r7d, [rsp+ 4]
    mov                 r9d, [rsp+12]
    jz .skip_line
    mova                m14, [base+unpckw]
    movq                 m6, [srcq+r10]
    movq                 m7, [srcq+r11]
    movhps               m6, [srcq+r13]
    movhps               m7, [srcq+ rX]
    movq                 m4, [srcq+ r4]
    movq                 m5, [srcq+ r6]
    movhps               m4, [srcq+ r7]
    movhps               m5, [srcq+ r9]
    add                srcq, ssq
    mov                 myd, [rsp+0x140]
    mov                 dyd, dym
    pshufd               m9, m14, q1032
    pshufb               m0, m14                ; 0a 1a
    pshufb               m1, m14                ; 0b 1b
    pshufb               m2, m9                 ; 3a 2a
    pshufb               m3, m9                 ; 3b 2b
    pmaddubsw            m6, [rsp+0x30]
    pmaddubsw            m7, [rsp+0x40]
    pmaddubsw            m4, [rsp+0x10]
    pmaddubsw            m5, [rsp+0x20]
    phaddw               m6, m7
    phaddw               m4, m5
    phaddw               m4, m6
    pmulhrsw             m4, m12
    pshufb               m5, [rsp+0x50], m14    ; 4a 5a
    pshufb               m6, [rsp+0x60], m14    ; 4b 5b
    pshufb               m7, [rsp+0x70], m9     ; 7a 6a
    pshufb               m8, [rsp+0x80], m9     ; 7b 6b
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    punpckhwd            m5, m7 ; 56a
    punpckhwd            m6, m8 ; 56b
    punpcklwd            m7, m4 ; 78a
    punpckhqdq           m4, m4
    punpcklwd            m8, m4 ; 78b
    mova         [rsp+0x50], m5
    mova         [rsp+0x60], m6
    mova         [rsp+0x70], m7
    mova         [rsp+0x80], m8
    jmp .vloop
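; Two-row vertical step: refilter a fresh row pair with MC_8TAP_SCALED_H
; and rotate the 01/23/45/67 history down by two rows.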
.skip_line:
    mova                 m0, [rsp+0x10]
    mova                 m1, [rsp+0x20]
    mova                m14, [rsp+0x30]
    mova                m15, [rsp+0x40]
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
    mov                 myd, [rsp+0x140]
    mov                 dyd, dym
    mova                 m0, m2         ; 01a
    mova                 m1, m3         ; 01b
    mova                 m2, [rsp+0x50] ; 23a
    mova                 m3, [rsp+0x60] ; 23b
    mova                 m5, [rsp+0x70] ; 45a
    mova                 m6, [rsp+0x80] ; 45b
    punpcklwd            m7, m4, m8     ; 67a
    punpckhwd            m4, m8         ; 67b
    mova         [rsp+0x50], m5
    mova         [rsp+0x60], m6
    mova         [rsp+0x70], m7
    mova         [rsp+0x80], m4
%else
    mov                 r0m, r0
    mov                 myd, mym
    mov                  r3, r3m
    add                 myd, dym
    test                myd, ~0x3ff
    mov                 mym, myd
    jnz .next_line
    mova                 m0, [esp+0x140]
    mova                 m1, [esp+0x150]
    mova                 m2, [esp+0x160]
    mova                 m3, [esp+0x170]
    jmp .vloop
.next_line:
    test                myd, 0x400
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    jz .skip_line
    mova                 m6, [base+unpckw]
    mova                 m0, [esp+0x140]
    mova                 m1, [esp+0x150]
    mova                 m7, [esp+0x180]
    movq                 m4, [srcq+r0]
    movq                 m5, [srcq+rX]
    movhps               m4, [srcq+r4]
    movhps               m5, [srcq+r5]
    pshufb               m0, m6         ; 0a 1a
    pshufb               m1, m6         ; 0b 1b
    pshufb               m7, m6         ; 4a 5a
    mov                  r0, [esp+16]
    mov                  rX, [esp+24]
    mov                  r4, [esp+20]
    mov                  r5, [esp+28]
    movq                 m3, [srcq+r0]
    movq                 m2, [srcq+rX]
    movhps               m3, [srcq+r4]
    movhps               m2, [srcq+r5]
    add                srcq, ssq
    pmaddubsw            m4, [esp+0x20]
    pmaddubsw            m5, [esp+0x30]
    pmaddubsw            m3, [esp+0x40]
    pmaddubsw            m2, [esp+0x50]
    phaddw               m4, m5
    phaddw               m3, m2
    mova                 m5, [esp+0x190]
    mova                 m2, [esp+0x160]
    phaddw               m4, m3
    mova                 m3, [esp+0x170]
    pmulhrsw             m4, m12        ; 8a 8b
    mov                 myd, mym
    pshufb               m5, m6         ; 4b 5b
    pshufd               m6, m6, q1032
    pshufb               m2, m6         ; 3a 2a
    pshufb               m3, m6         ; 3b 2b
    punpckhwd            m0, m2         ; 12a
    punpckhwd            m1, m3         ; 12b
    mova        [esp+0x140], m0
    mova        [esp+0x150], m1
    mova                 m0, [esp+0x1a0]
    mova                 m1, [esp+0x1b0]
    punpcklwd            m2, m7         ; 34a
    punpcklwd            m3, m5         ; 34b
    mova        [esp+0x160], m2
    mova        [esp+0x170], m3
    pshufb               m0, m6         ; 7a 6a
    pshufb               m1, m6         ; 7b 6b
    punpckhwd            m7, m0         ; 56a
    punpckhwd            m5, m1         ; 56b
    punpcklwd            m0, m4
    punpckhqdq           m4, m4
    punpcklwd            m1, m4
    mova        [esp+0x180], m7
    mova        [esp+0x190], m5
    mova        [esp+0x1a0], m0
    mova        [esp+0x1b0], m1
    mova                 m0, [esp+0x140]
    mova                 m1, [esp+0x150]
    jmp .vloop
.skip_line:
    MC_8TAP_SCALED_H   0x20, 0x1c0, 0
    mov                 myd, mym
    mova                 m0, [esp+0x160]
    mova                 m1, [esp+0x170]
    mova                 m2, [esp+0x180]
    mova                 m3, [esp+0x190]
    mova         [esp+0x140], m0
    mova         [esp+0x150], m1
    mova                 m4, [esp+0x1a0]
    mova                 m5, [esp+0x1b0]
    mova        [esp+0x160], m2
    mova        [esp+0x170], m3
    mova                 m6, [esp+0x1c0]
    mova                 m7, [esp+0x1d0]
    mova        [esp+0x180], m4
    mova        [esp+0x190], m5
    punpcklwd            m4, m6, m7
    punpckhwd            m6, m7
    mova        [esp+0x1a0], m4
    mova        [esp+0x1b0], m6
%endif
    jmp .vloop
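; dy1 path: the vertical step is exactly one source row per output row, so
; the vertical filter is constant for the block and rows simply shift
; through the history.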
INIT_XMM ssse3
.dy1:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%ifidn %1, put
.dy1_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
 %else
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
    movzx                r5, byte [esp+0x1f0]
    dec                srcd
    movd                m15, r5
 %endif
    punpckldq            m9, m8
    SWAP                 m8, m9
    paddd               m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
 %else
  %define m11 [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    psrldq              m15, 4
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_dw]
    mova                 m6, [base+subpel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m9, m9
    pcmpeqd              m8, m9
    psrld               m14, 10
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova         [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq                 m0, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*2]
    movhps               m0, [srcq+ssq*1]
    movhps               m2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    movq                m10, r4
 %else
    mov                 myd, mym
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                 myd, 6
    lea                  r5, [r5+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
  %define m10 m4
    movd                m10, r4
    movd                 m3, r3
    mov                  r3, r3m
    punpckldq           m10, m3
 %endif
    movq                 m1, [srcq+ssq*0]
    movq                 m3, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*1]
    add                srcq, ss3q
    punpcklbw           m10, m10
    psraw               m10, 8
    punpckldq           m15, m7
    punpcklqdq          m15, m15
 %if ARCH_X86_64
    pand                m11, m8
 %else
    pand                 m7, m11, m8
  %define m11 m7
 %endif
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
 %if ARCH_X86_64
    pshufd               m8, m10, q0000
    pshufd               m9, m10, q1111
    pshufd              m11, m10, q3333
    pshufd              m10, m10, q2222
 %else
    mova         [esp+0x10], m15
  %define m15 [esp+0x10]
    mov                  r0, r0m
    pshufd               m5, m4, q0000
    pshufd               m6, m4, q1111
    pshufd               m7, m4, q2222
    pshufd               m4, m4, q3333
  %define m8  [esp+0x20]
  %define m9  [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova                 m8, m5
    mova                 m9, m6
    mova                m10, m7
    mova                m11, m4
 %endif
    pshufb               m0, m14
    pshufb               m2, m14
    pshufb               m1, m14
    pshufb               m3, m14
    pmaddubsw            m0, m15
    pmaddubsw            m2, m15
    pmaddubsw            m1, m15
    pmaddubsw            m3, m15
    phaddw               m0, m2
    phaddw               m1, m3
    pmulhrsw             m0, m12
    pmulhrsw             m1, m12
    palignr              m2, m1, m0, 4
    pshufd               m4, m1, q2121
    punpcklwd            m3, m0, m2     ; 01 12
    punpckhwd            m0, m2         ; 23 34
    punpcklwd            m2, m1, m4     ; 45 56
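; dy1 w2 loop: two output rows per iteration; one new source row pair is
; filtered horizontally and shifted into the 01-12/23-34/45-56 word pairs.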
.dy1_w2_loop:
    movq                 m1, [srcq+ssq*0]
    movhps               m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m5, m3, m8
    pmaddwd              m6, m0, m9
    pmaddwd              m7, m2, m10
    mova                 m3, m0
    mova                 m0, m2
    paddd                m5, m13
    paddd                m6, m7
    pshufb               m1, m14
    pmaddubsw            m1, m15
    phaddw               m1, m1
    pmulhrsw             m1, m12
    palignr              m7, m1, m4, 12
    punpcklwd            m2, m7, m1     ; 67 78
    pmaddwd              m7, m2, m11
    mova                 m4, m1
    paddd                m5, m6
    paddd                m5, m7
    psrad                m5, rndshift
    packssdw             m5, m5
    packuswb             m5, m5
    movd                r4d, m5
    mov        [dstq+dsq*0], r4w
    shr                 r4d, 16
    mov        [dstq+dsq*1], r4w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy1_w2_loop
    RET
%endif
INIT_XMM ssse3
.dy1_w4:
%if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq r3
 %endif
    movzx                r4, byte [esp+0x1f0]
    dec                srcq
    movd                m15, r4
%endif
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    psrldq               m7, m15, 8
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                r6d, m15
    movd               r13d, m7
    movd                m15, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+r11*8+2]
    movd                 m3, [base+subpel_filters+ r6*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
%else
    movd                 r1, m15
    movd                 r3, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                 r4, m15
    movd                 r5, m7
 %define m15 m5
    SWAP                 m4, m7
    movd                m15, [base+subpel_filters+r1*8+2]
    movd                 m2, [base+subpel_filters+r3*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m4, [base+subpel_filters+r5*8+2]
    mov                 myd, mym
    mov                  rX, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  rX, [rX+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+rX*8+0]
    cmovnz               r5, [base+subpel_filters+rX*8+4]
    mov                  r3, r3m
 %if isprep
    lea                ss3q, [ssq*3]
 %endif
%endif
    punpckldq           m15, m3
    punpckldq            m2, m4
    punpcklqdq          m15, m2
    movq                 m6, [base+subpel_s_shuf2]
%if ARCH_X86_64
    pcmpeqd              m8, m9
    psrld               m14, 10
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpcklqdq           m6, m6
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m7, [srcq+ssq*2]
    add                srcq, ss3q
    pand                m11, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
    paddb               m14, m6
    movq                m10, r4q
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m4, m14
    pshufb               m5, m14
    pshufb               m7, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    pmaddubsw            m4, m15
    pmaddubsw            m5, m15
    pmaddubsw            m7, m15
    phaddw               m0, m1
    phaddw               m2, m3
    phaddw               m4, m5
    phaddw               m6, m7, m7
    pmulhrsw             m0, m12    ; 0 1
    pmulhrsw             m2, m12    ; 2 3
    pmulhrsw             m4, m12    ; 4 5
    pmulhrsw             m6, m12    ; 6 _
    shufps               m1, m0, m2, q1032  ; 1 2
    shufps               m3, m2, m4, q1032  ; 3 4
    shufps               m5, m4, m6, q1032  ; 5 6
    punpcklwd            m7, m0, m1 ; 01
    punpckhwd            m0, m1     ; 12
    punpcklwd            m8, m2, m3 ; 23
    punpckhwd            m2, m3     ; 34
    punpcklwd            m9, m4, m5 ; 45
    punpckhwd            m4, m5     ; 56
%else
    pxor                 m3, m3
    pcmpeqd              m8, m3
    psrld               m14, 10
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    add                srcq, ss3q
    punpcklqdq           m6, m6
    SWAP                 m4, m7
    pand                 m7, m11, m8
    pandn                m8, m15
    SWAP                 m5, m0
    por                 m15, m7
    paddb               m14, m6
    movu                 m0, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m0, m14
    pshufb               m7, m14
    pshufb               m6, m14
    pmaddubsw            m1, m15
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    mova         [esp+0x00], m14
    mova         [esp+0x10], m15
    pmaddubsw            m0, m15
    pmaddubsw            m7, m15
    pmaddubsw            m6, m15
    phaddw               m1, m2
    movu                 m2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    mov                  r0, r0m
    phaddw               m3, m0
    pshufb               m2, m14
    pmaddubsw            m2, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw               m7, m6
    phaddw               m2, m2
    movd                 m6, r4
    movd                 m0, r5
    punpckldq            m6, m0
    punpcklbw            m6, m6
    psraw                m6, 8
    mova         [esp+0x20], m6
    pmulhrsw             m1, m12 ; 0 1
    pmulhrsw             m3, m12 ; 2 3
    pmulhrsw             m7, m12 ; 4 5
    pmulhrsw             m2, m12 ; 6 _
    shufps               m0, m1, m3, q1032  ; 1 2
    shufps               m4, m3, m7, q1032  ; 3 4
    shufps               m5, m7, m2, q1032  ; 5 6
    punpcklwd            m6, m1, m0 ; 01
    punpckhwd            m1, m0     ; 12
    mova         [esp+0x30], m1
    punpcklwd            m1, m3, m4 ; 23
    punpckhwd            m3, m4     ; 34
    mova         [esp+0x40], m3
    punpcklwd            m3, m7, m5 ; 45
    punpckhwd            m7, m5     ; 56
    mova         [esp+0x50], m7
    mova         [esp+0x60], m2
    mova                 m0, [esp+0x20]
 %xdefine m8 m1
 %xdefine m9 m3
 %xdefine m10 m0
    SWAP                 m7, m6
    SWAP                 m1, m4
    SWAP                 m3, m2
%endif
    pshufd               m1, m10, q0000
    pshufd               m3, m10, q1111
    pshufd               m5, m10, q2222
    pshufd              m10, m10, q3333
%if ARCH_X86_64
    mova         [rsp+0x00], m8
    mova         [rsp+0x10], m2
    mova         [rsp+0x20], m9
    mova         [rsp+0x30], m4
%else
    mova         [esp+0x70], m8
    mova         [esp+0x80], m9
    mova         [esp+0x90], m1
    mova         [esp+0xa0], m3
    mova         [esp+0xb0], m5
    mova         [esp+0xc0], m10
 %ifidn %1, put
    mov                 dsd, dsm
 %endif
 %define m11 m6
%endif
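; dy1 w4 loop: two output rows per iteration. The constant vertical
; coefficient pairs were broadcast above into m1/m3/m5/m10 (x86-64) or
; [esp+0x90..0xc0] (x86-32).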
.dy1_w4_loop:
%if ARCH_X86_64
    movu                m11, [srcq+ssq*0]
    pmaddwd              m7, m1
    pmaddwd              m8, m3
    pmaddwd              m0, m1
    pmaddwd              m2, m3
    pmaddwd              m9, m5
    pmaddwd              m4, m5
    paddd                m7, m8
    paddd                m0, m2
    movu                 m8, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              m11, m14
    pmaddubsw           m11, m15
    paddd                m7, m13
    paddd                m0, m13
    paddd                m7, m9
    paddd                m0, m4
    pshufb               m8, m14
    pmaddubsw            m8, m15
    phaddw              m11, m8
    mova                 m8, [rsp+0x20]
    pmulhrsw            m11, m12
    punpcklwd            m9, m6, m11    ; 67
    psrldq               m6, m11, 8
    punpcklwd            m4, m11, m6    ; 78
    pmaddwd              m2, m9, m10
    pmaddwd             m11, m4, m10
    paddd                m7, m2
    mova                 m2, [rsp+0x30]
    paddd                m0, m11
%else
    SWAP                 m7, m6
    SWAP                 m1, m4
    SWAP                 m3, m2
    movu                 m5, [srcq+ssq*0]
    mova                 m0, [esp+0x30]
    mova                 m2, [esp+0x40]
    mova                 m4, [esp+0x50]
    pmaddwd              m6, [esp+0x90]
    pmaddwd              m1, [esp+0xa0]
    pmaddwd              m0, [esp+0x90]
    pmaddwd              m2, [esp+0xa0]
    pmaddwd              m3, [esp+0xb0]
    pmaddwd              m4, [esp+0xb0]
    paddd                m6, m1
    paddd                m0, m2
    movu                 m7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m5, m14
    pmaddubsw            m5, m15
    paddd                m6, m13
    paddd                m0, m13
    paddd                m6, m3
    paddd                m0, m4
    pshufb               m7, m14
    pmaddubsw            m7, m15
    phaddw               m5, m7
    mova                 m7, [rsp+0x80]
    pmulhrsw             m5, m12
    punpcklwd            m3, [esp+0x60], m5 ; 67
    psrldq               m1, m5, 8
    punpcklwd            m4, m5, m1         ; 78
    pmaddwd              m2, m3, [esp+0xc0]
    pmaddwd              m5, m4, [esp+0xc0]
    mova         [esp+0x60], m1
    paddd                m6, m2
    mova                 m2, [esp+0x50]
    paddd                m0, m5
    SWAP                 m7, m6
%endif
    psrad                m7, rndshift
    psrad                m0, rndshift
    packssdw             m7, m0
%if ARCH_X86_64
    mova                 m0, [rsp+0x10]
%else
    mova                 m0, [esp+0x40]
%define m11 m5
%endif
%ifidn %1, put
    packuswb             m7, m7
    psrldq              m11, m7, 4
    movd       [dstq+dsq*0], m7
    movd       [dstq+dsq*1], m11
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], m7
    add                tmpq, 16
%endif
    sub                  hd, 2
    jz .ret
%if ARCH_X86_64
    mova                 m7, [rsp+0x00]
    mova         [rsp+0x00], m8
    mova         [rsp+0x10], m2
    mova         [rsp+0x20], m9
    mova         [rsp+0x30], m4
%else
    mova                 m7, [esp+0x70] ; 01
    mova                 m1, [esp+0x80] ; 23
    mova                 m2, [esp+0x50] ; 34
    mova         [esp+0x30], m0
    mova         [esp+0x70], m1
    mova         [esp+0x40], m2
    mova         [esp+0x80], m3
    mova         [esp+0x50], m4
%endif
    jmp .dy1_w4_loop
INIT_XMM ssse3
.dy1_w8:
    mov    dword [rsp+0x90], 1
    movifprep   tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov    dword [rsp+0x90], 2
    movifprep   tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov    dword [rsp+0x90], 4
    movifprep   tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov    dword [rsp+0x90], 8
    movifprep   tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov    dword [rsp+0x90], 16
    movifprep   tmp_stridem, 256
.dy1_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
%if ARCH_X86_64
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    movd                m15, t0d
%else
 %define m8   m0
 %define m9   m1
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov                  r5, [esp+0x1f0]
    mov                  r3, [esp+0x1f4]
    shr                  r5, 16
    sub                srcq, 3
    movd                m15, r5
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r0, r0m
    mov                  r3, r3m
%endif
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq                 m3, r4q
    punpcklbw            m3, m3
    psraw                m3, 8
%else
    movd                 m5, r4
    movd                 m6, r5
    punpckldq            m5, m6
    punpcklbw            m5, m5
    psraw                m5, 8
    SWAP                 m3, m5
%endif
    mova        [rsp+0x100], m7
    mova        [rsp+0x120], m15
    mov         [rsp+0x098], srcq
    mov         [rsp+0x130], r0q ; dstq / tmpq
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova        [rsp+0x140], m0
    mova        [rsp+0x150], m1
    mova        [rsp+0x160], m2
    mova        [rsp+0x170], m3
%if ARCH_X86_64 && UNIX64
    mov                  hm, hd
%elif ARCH_X86_32
    SWAP                  m5, m3
    mov                   r5, hm
    mov          [esp+0x134], r5
%endif
    jmp .dy1_hloop
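; Per-strip advance for the dy1 path, mirroring .hloop_prep above.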
6024.dy1_hloop_prep:
6025    dec   dword [rsp+0x090]
6026    jz .ret
6027%if ARCH_X86_64
6028    add   qword [rsp+0x130], 8*(isprep+1)
6029    mov                  hd, hm
6030%else
6031    add   dword [rsp+0x130], 8*(isprep+1)
6032    mov                  r5, [esp+0x134]
6033    mov                  r0, [esp+0x130]
6034%endif
6035    mova                 m7, [rsp+0x100]
6036    mova                m14, [rsp+0x110]
6037%if ARCH_X86_64
6038    mova                m10, [base+pd_0x3ff]
6039%else
6040 %define m10 [base+pd_0x3ff]
6041%endif
6042    mova                m15, [rsp+0x120]
6043    mov                srcq, [rsp+0x098]
6044%if ARCH_X86_64
6045    mov                 r0q, [rsp+0x130] ; dstq / tmpq
6046%else
6047    mov                  hm, r5
6048    mov                 r0m, r0
6049    mov                  r3, r3m
6050%endif
6051    paddd               m14, m7
6052.dy1_hloop:
6053    pxor                 m9, m9
6054%if ARCH_X86_64
6055    mova                m11, [base+pq_0x40000000]
6056%else
6057 %define m11 [base+pq_0x40000000]
6058%endif
6059    psrld                m2, m14, 10
6060    mova              [rsp], m2
6061    pand                 m6, m14, m10
6062    psrld                m6, 6
6063    paddd                m5, m15, m6
6064    pcmpeqd              m6, m9
6065    psrldq               m2, m5, 8
6066%if ARCH_X86_64
6067    movd                r4d, m5
6068    movd                r6d, m2
6069    psrldq               m5, 4
6070    psrldq               m2, 4
6071    movd                r7d, m5
6072    movd                r9d, m2
6073    movq                 m0, [base+subpel_filters+r4*8]
6074    movq                 m1, [base+subpel_filters+r6*8]
6075    movhps               m0, [base+subpel_filters+r7*8]
6076    movhps               m1, [base+subpel_filters+r9*8]
6077%else
6078    movd                 r0, m5
6079    movd                 rX, m2
6080    psrldq               m5, 4
6081    psrldq               m2, 4
6082    movd                 r4, m5
6083    movd                 r5, m2
6084    movq                 m0, [base+subpel_filters+r0*8]
6085    movq                 m1, [base+subpel_filters+rX*8]
6086    movhps               m0, [base+subpel_filters+r4*8]
6087    movhps               m1, [base+subpel_filters+r5*8]
6088    pxor                 m2, m2
6089 %define m9 m2
6090%endif
6091    paddd               m14, m7 ; mx+dx*[4-7]
6092    pand                 m5, m14, m10
6093    psrld                m5, 6
6094    paddd               m15, m5
6095    pcmpeqd              m5, m9
6096    mova        [rsp+0x110], m14
6097    psrldq               m4, m15, 8
6098%if ARCH_X86_64
6099    movd               r10d, m15
6100    movd               r11d, m4
6101    psrldq              m15, 4
6102    psrldq               m4, 4
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    psrldq               m4, m14, 8
    movd               r10d, m14
    movd               r11d, m4
    psrldq              m14, 4
    psrldq               m4, 4
    movd               r13d, m14
    movd                rXd, m4
    mov                 r4d, [rsp+ 0]
    mov                 r6d, [rsp+ 8]
    mov                 r7d, [rsp+ 4]
    mov                 r9d, [rsp+12]
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m8, m11, m4
    pand                 m9, m11, m6
    pand                m15, m11, m7
    pand                m11, m11, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn                m7, m2
    pandn                m5, m3
    por                  m8, m4
    por                  m9, m6
    por                 m15, m7
    por                 m11, m5
    mova         [rsp+0x10], m8
    mova         [rsp+0x20], m9
    mova         [rsp+0x30], m15
    mova         [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
    mova         [rsp+0x50], m1
    mova         [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
    mova         [rsp+0x70], m3
    mova         [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
    SWAP                 m7, m0
    SWAP                 m8, m14
    mova                 m1, [rsp+0x50]
    mova                 m2, [rsp+0x60]
    mova                 m3, [rsp+0x70]
    mova                m15, [rsp+0x80]
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    SWAP                m14, m8
    mova                 m8, [rsp+0x140]
    mova                 m9, [rsp+0x150]
    mova                m10, [rsp+0x160]
    mova                m11, [rsp+0x170]
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m15; 23a
    punpckhwd            m3, m15    ; 23b
    mova         [rsp+0x50], m4
    mova         [rsp+0x60], m5
    mova         [rsp+0x70], m6
    mova         [rsp+0x80], m7
    mova                m14, [base+unpckw]
%else
    movd                 r0, m15
    movd                 rX, m4
    psrldq              m15, 4
    psrldq               m4, 4
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [esp+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10
    mova           [esp+16], m14
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    mova         [esp+0x20], m0
    mova         [esp+0x30], m1
    mova         [esp+0x40], m2
    mova         [esp+0x50], m3
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m11, m4
    pand                 m1, m11, m6
    pand                 m2, m11, m7
    pand                 m3, m11, m5
    pandn                m4, [esp+0x20]
    pandn                m6, [esp+0x30]
    pandn                m7, [esp+0x40]
    pandn                m5, [esp+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    mova        [esp+0x20], m0
    mova        [esp+0x30], m1
    mova        [esp+0x40], m2
    mova        [esp+0x50], m3
    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
    mova                 m5, [esp+0x1a0]
    mova                 m6, [esp+0x1b0]
    mova                 m7, [esp+0x1c0]
    mova                 m0, [esp+0x1d0]
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova        [esp+0x1a0], m4
    mova        [esp+0x1b0], m5
    mova        [esp+0x1c0], m6
    mova        [esp+0x1d0], m7
    mova                 m1, [esp+0x060]
    mova                 m2, [esp+0x070]
    mova                 m3, [esp+0x180]
    mova                 m4, [esp+0x190]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova        [esp+0x060], m0
    mova        [esp+0x070], m1
    mova        [esp+0x180], m2
    mova        [esp+0x190], m3
 %define m8  [esp+0x140]
 %define m9  [esp+0x150]
 %define m10 [esp+0x160]
 %define m11 [esp+0x170]
%endif
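; .dy1_vloop: one output row per iteration; the four row-pair products
; (01, 23, 45, 67) are accumulated against the vertical coefficients
; m8-m11, rounded via m13/rndshift, then stored as 8 pixels (put) or
; 8 words (prep).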
.dy1_vloop:
%if ARCH_X86_32
    mov                  r0, r0m
%endif
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m8
    pmaddwd              m6, m2, m9
    pmaddwd              m7, m3, m9
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
%if ARCH_X86_64
    pmaddwd              m6, [rsp+0x50], m10
    pmaddwd              m7, [rsp+0x60], m10
%else
    pmaddwd              m6, [rsp+0x1a0], m10
    pmaddwd              m7, [rsp+0x1b0], m10
%endif
    paddd                m4, m6
    paddd                m5, m7
%if ARCH_X86_64
    pmaddwd              m6, [rsp+0x70], m11
    pmaddwd              m7, [rsp+0x80], m11
%else
    pmaddwd              m6, [rsp+0x1c0], m11
    pmaddwd              m7, [rsp+0x1d0], m11
%endif
    paddd                m4, m6
    paddd                m5, m7
    psrad                m4, rndshift
    psrad                m5, rndshift
    packssdw             m4, m5
%ifidn %1, put
    packuswb             m4, m4
    movq             [dstq], m4
    add                dstq, dsm
%else
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
%if ARCH_X86_32
    mov                 r0m, r0
%endif
    dec                  hd
    jz .dy1_hloop_prep
%if ARCH_X86_64
    movq                 m4, [srcq+ r4]
    movq                 m5, [srcq+ r6]
    movhps               m4, [srcq+ r7]
    movhps               m5, [srcq+ r9]
    movq                 m6, [srcq+r10]
    movq                 m7, [srcq+r11]
    movhps               m6, [srcq+r13]
    movhps               m7, [srcq+ rX]
    add                srcq, ssq
    pshufd              m15, m14, q1032
    pshufb               m0, m14                ; 0a 1a
    pshufb               m1, m14                ; 0b 1b
    pshufb               m2, m15                ; 3a 2a
    pshufb               m3, m15                ; 3b 2b
    pmaddubsw            m4, [rsp+0x10]
    pmaddubsw            m5, [rsp+0x20]
    pmaddubsw            m6, [rsp+0x30]
    pmaddubsw            m7, [rsp+0x40]
    phaddw               m4, m5
    phaddw               m6, m7
    phaddw               m4, m6
    pmulhrsw             m4, m12
    pshufb               m5, [rsp+0x70], m15    ; 7a 6a
    pshufb               m7, [rsp+0x80], m15    ; 7b 6b
    pshufb               m6, [rsp+0x50], m14    ; 4a 5a
    pshufb              m15, [rsp+0x60], m14    ; 4b 5b
    punpckhwd            m0, m2  ; 12a
    punpckhwd            m1, m3  ; 12b
    punpcklwd            m2, m6  ; 34a
    punpcklwd            m3, m15 ; 34b
    punpckhwd            m6, m5  ; 56a
    punpckhwd           m15, m7  ; 56b
    punpcklwd            m5, m4  ; 78a
    psrldq               m4, 8
    punpcklwd            m7, m4  ; 78b
    mova         [rsp+0x50], m6
    mova         [rsp+0x60], m15
    mova         [rsp+0x70], m5
    mova         [rsp+0x80], m7
%else
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    mova                 m6, [base+unpckw]
    mova                 m0, [esp+0x060]
    mova                 m1, [esp+0x070]
    mova                 m7, [esp+0x1a0]
    movq                 m4, [srcq+r0]
    movq                 m5, [srcq+rX]
    movhps               m4, [srcq+r4]
    movhps               m5, [srcq+r5]
    pshufb               m0, m6         ; 0a 1a
    pshufb               m1, m6         ; 0b 1b
    pshufb               m7, m6         ; 4a 5a
    mov                  r0, [esp+16]
    mov                  rX, [esp+24]
    mov                  r4, [esp+20]
    mov                  r5, [esp+28]
    movq                 m3, [srcq+r0]
    movq                 m2, [srcq+rX]
    movhps               m3, [srcq+r4]
    movhps               m2, [srcq+r5]
    add                srcq, ssq
    pmaddubsw            m4, [esp+0x20]
    pmaddubsw            m5, [esp+0x30]
    pmaddubsw            m3, [esp+0x40]
    pmaddubsw            m2, [esp+0x50]
    phaddw               m4, m5
    phaddw               m3, m2
    mova                 m5, [esp+0x1b0]
    mova                 m2, [esp+0x180]
    phaddw               m4, m3
    mova                 m3, [esp+0x190]
    pmulhrsw             m4, m12        ; 8a 8b
    pshufb               m5, m6         ; 4b 5b
    pshufd               m6, m6, q1032
    pshufb               m2, m6         ; 3a 2a
    pshufb               m3, m6         ; 3b 2b
    punpckhwd            m0, m2         ; 12a
    punpckhwd            m1, m3         ; 12b
    mova         [esp+0x60], m0
    mova         [esp+0x70], m1
    mova                 m0, [esp+0x1c0]
    mova                 m1, [esp+0x1d0]
    punpcklwd            m2, m7         ; 34a
    punpcklwd            m3, m5         ; 34b
    mova        [esp+0x180], m2
    mova        [esp+0x190], m3
    pshufb               m0, m6         ; 7a 6a
    pshufb               m1, m6         ; 7b 6b
    punpckhwd            m7, m0         ; 56a
    punpckhwd            m5, m1         ; 56b
    punpcklwd            m0, m4
    punpckhqdq           m4, m4
    punpcklwd            m1, m4
    mova        [esp+0x1a0], m7
    mova        [esp+0x1b0], m5
    mova        [esp+0x1c0], m0
    mova        [esp+0x1d0], m1
    mova                 m0, [esp+0x60]
    mova                 m1, [esp+0x70]
%endif
    jmp .dy1_vloop
INIT_XMM ssse3
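; dy2 path: the vertical position advances by exactly two source rows per
; output row, so each iteration feeds two fresh horizontally filtered rows
; into the 8-tap vertical filter; dispatch on width via the dy2 jump table.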
.dy2:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%ifidn %1, put
.dy2_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
 %else
  %define m10 [base+pd_0x3ff]
  %define m11 [base+pd_0x4000]
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
    movzx                r5, byte [esp+0x1f0]
    dec                srcd
    movd                m15, r5
 %endif
    punpckldq            m9, m8
    SWAP                 m8, m9
    paddd               m14, m8 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    psrldq              m15, 4
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_dw]
    mova                 m6, [base+subpel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m9, m9
    pcmpeqd              m8, m9
    psrld               m14, 10
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova         [esp+0x00], m14
  %define m14 [esp+0x00]
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m8  m5
  %define m15 m6
 %endif
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    movhps               m0, [srcq+ssq*2]
    movhps               m1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
 %if ARCH_X86_64
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    movq                m10, r4q
 %else
    mov                 myd, mym
    mov                  r3, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r3, r3m
  %define m10 m4
    movd                m10, r4
    movd                 m3, r5
    punpckldq           m10, m3
 %endif
    movq                 m3, [srcq+ssq*0]
    movhps               m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklbw           m10, m10
    psraw               m10, 8
    punpckldq           m15, m7
    punpcklqdq          m15, m15
 %if ARCH_X86_64
    pand                m11, m8
 %else
    pand                 m7, m11, m8
  %define m11 m7
 %endif
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
 %if ARCH_X86_64
    pshufd               m8, m10, q0000
    pshufd               m9, m10, q1111
    pshufd              m11, m10, q3333
    pshufd              m10, m10, q2222
 %else
    mova         [esp+0x10], m15
  %define m15 [esp+0x10]
    mov                  r5, r0m
  %define dstq r5
    mov                 dsd, dsm
    pshufd               m5, m4, q0000
    pshufd               m6, m4, q1111
    pshufd               m7, m4, q2222
    pshufd               m4, m4, q3333
  %define m8  [esp+0x20]
  %define m9  [esp+0x30]
  %define m10 [esp+0x40]
  %define m11 [esp+0x50]
    mova                 m8, m5
    mova                 m9, m6
    mova                m10, m7
    mova                m11, m4
 %endif
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m3, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    pmaddubsw            m3, m15
    pslldq               m2, m3, 8
    phaddw               m0, m2
    phaddw               m1, m3
    pmulhrsw             m0, m12            ; 0 2 _ 4
    pmulhrsw             m1, m12            ; 1 3 _ 5
    pshufd               m2, m0, q3110      ; 0 2 2 4
    pshufd               m1, m1, q3110      ; 1 3 3 5
    punpcklwd            m3, m2, m1         ; 01 23
    punpckhwd            m2, m1             ; 23 45
.dy2_w2_loop:
    movq                 m6, [srcq+ssq*0]
    movq                 m7, [srcq+ssq*1]
    movhps               m6, [srcq+ssq*2]
    movhps               m7, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pmaddwd              m4, m3, m8
    pmaddwd              m5, m2, m9
    pshufb               m6, m14
    pshufb               m7, m14
    pmaddubsw            m6, m15
    pmaddubsw            m7, m15
    phaddw               m6, m7
    pmulhrsw             m6, m12
    psrldq               m7, m6, 8
    palignr              m6, m0, 8
    palignr              m7, m1, 8
    mova                 m0, m6
    mova                 m1, m7
    pshufd               m6, m6, q3221
    pshufd               m7, m7, q3221
    punpcklwd            m3, m6, m7       ; 45 67
    punpckhwd            m2, m6, m7       ; 67 89
    pmaddwd              m6, m3, m10
    pmaddwd              m7, m2, m11
    paddd                m4, m5
    paddd                m4, m13
    paddd                m6, m7
    paddd                m4, m6
    psrad                m4, rndshift
    packssdw             m4, m4
    packuswb             m4, m4
    movd                r4d, m4
    mov        [dstq+dsq*0], r4w
    shr                 r4d, 16
    mov        [dstq+dsq*1], r4w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy2_w2_loop
    RET
%endif
INIT_XMM ssse3
.dy2_w4:
%if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %define dstq r0
 %if isprep
  %define ssq r3
 %endif
    movzx                r4, byte [esp+0x1f0]
    dec                srcq
    movd                m15, r4
%endif
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                m11, [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    psrldq               m7, m15, 8
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                r6d, m15
    movd               r13d, m7
    movd                m15, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+r11*8+2]
    movd                 m3, [base+subpel_filters+ r6*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
    movq                 m6, [base+subpel_s_shuf2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
%else
    movd                 r1, m15
    movd                 r3, m7
    psrldq              m15, 4
    psrldq               m7, 4
    movd                 r4, m15
    movd                 r5, m7
 %define m15 m5
    SWAP                 m4, m7
    movd                m15, [base+subpel_filters+r1*8+2]
    movd                 m2, [base+subpel_filters+r3*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m4, [base+subpel_filters+r5*8+2]
    movq                 m6, [base+subpel_s_shuf2]
    mov                 myd, mym
    mov                  r3, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r3, r3m
 %if isprep
    lea                ss3q, [ssq*3]
 %endif
%endif
    punpckldq           m15, m3
    punpckldq            m2, m4
    punpcklqdq          m15, m2
%if ARCH_X86_64
    pcmpeqd              m8, m9
    psrld               m14, 10
    movu                 m0, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*2]
    movu                 m1, [srcq+ssq*1]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpcklqdq           m6, m6
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pand                m11, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m11
    paddb               m14, m6
    movq                m11, r4q
    punpcklbw           m11, m11
    psraw               m11, 8
    pshufb               m0, m14
    pshufb               m2, m14
    pshufb               m1, m14
    pshufb               m3, m14
    pshufb               m4, m14
    pshufb               m5, m14
    pmaddubsw            m0, m15
    pmaddubsw            m2, m15
    pmaddubsw            m1, m15
    pmaddubsw            m3, m15
    pmaddubsw            m4, m15
    pmaddubsw            m5, m15
    phaddw               m0, m2
    phaddw               m1, m3
    phaddw               m4, m5
    pmulhrsw             m0, m12    ; 0 2
    pmulhrsw             m1, m12    ; 1 3
    pmulhrsw             m4, m12    ; 4 5
    pshufd               m8, m11, q0000
    pshufd               m9, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
%else
    pxor                 m3, m3
    pcmpeqd              m8, m3
    psrld               m14, 10
    pshufb              m14, [base+bdct_lb_dw]
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ssq*1]
    add                srcq, ss3q
    punpcklqdq           m6, m6
    SWAP                 m4, m7
    pand                 m7, m11, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m7
    paddb               m14, m6
    movu                 m0, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, ss3q
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m0, m14
    pshufb               m7, m14
    pshufb               m6, m14
    pmaddubsw            m1, m15
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    mova         [esp+0x00], m14
    mova         [esp+0x10], m15
    pmaddubsw            m0, m15
    pmaddubsw            m7, m15
    pmaddubsw            m6, m15
 %define m14 [esp+0x00]
 %define m15 [esp+0x10]
    phaddw               m1, m2
    phaddw               m3, m0
    phaddw               m7, m6
 %ifidn %1, put
    mov                 dsd, dsm
  %define dstq r5
 %else
  %define tmpq r5
 %endif
    movd                 m6, r4
    movd                 m0, r5
    punpckldq            m6, m0
    punpcklbw            m6, m6
    psraw                m6, 8
    mov                  r5, r0m
    pmulhrsw             m1, m12 ; 0 2
    pmulhrsw             m3, m12 ; 1 3
    pmulhrsw             m7, m12 ; 4 5
    SWAP                 m0, m1, m3
    SWAP                 m4, m7
    pshufd               m2, m6, q0000
    pshufd               m3, m6, q1111
    pshufd               m7, m6, q2222
    pshufd               m6, m6, q3333
    mova         [esp+0x30], m2
    mova         [esp+0x40], m3
    mova         [esp+0x50], m7
    mova         [esp+0x60], m6
 %define m8  [esp+0x30]
 %define m9  [esp+0x40]
 %define m10 [esp+0x50]
 %define m11 [esp+0x60]
%endif
    psrldq               m5, m4, 8  ; 5 _
    punpckhwd            m2, m0, m1 ; 23
    punpcklwd            m0, m1     ; 01
    punpcklwd            m4, m5     ; 45
.dy2_w4_loop:
    pmaddwd              m0, m8         ; a0
    pmaddwd              m5, m2, m8     ; b0
    pmaddwd              m2, m9         ; a1
    pmaddwd              m7, m4, m9     ; b1
    pmaddwd              m3, m4, m10    ; a2
    paddd                m0, m13
    paddd                m5, m13
    paddd                m0, m2
    paddd                m5, m7
    paddd                m0, m3
    movu                 m6, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    movu                 m1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb               m6, m14
    pshufb               m7, m14
    pshufb               m3, m14
    pshufb               m1, m14
    pmaddubsw            m6, m15
    pmaddubsw            m7, m15
    pmaddubsw            m3, m15
    pmaddubsw            m1, m15
    phaddw               m6, m7
    phaddw               m3, m1
    pmulhrsw             m6, m12    ; 6 7
    pmulhrsw             m3, m12    ; 8 9
    psrldq               m7, m6, 8
    psrldq               m1, m3, 8
    punpcklwd            m6, m7     ; 67
    punpcklwd            m3, m1     ; 89
    mova                 m2, m6
    pmaddwd              m1, m6, m10    ; b2
    pmaddwd              m6, m11        ; a3
    pmaddwd              m7, m3, m11    ; b3
    paddd                m5, m1
    paddd                m0, m6
    paddd                m5, m7
    psrad                m0, rndshift
    psrad                m5, rndshift
    packssdw             m0, m5
%ifidn %1, put
    packuswb             m0, m0
    psrldq               m1, m0, 4
    movd       [dstq+dsq*0], m0
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], m0
    add                tmpq, 16
%endif
    mova                 m0, m4
    mova                 m4, m3
    sub                  hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET
INIT_XMM ssse3
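; w >= 8: [rsp+0x90] holds the number of 8-pixel column blocks (w/8); the
; shared horizontal/vertical loop below processes one block at a time.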
.dy2_w8:
    mov    dword [rsp+0x90], 1
    movifprep   tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov    dword [rsp+0x90], 2
    movifprep   tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov    dword [rsp+0x90], 4
    movifprep   tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov    dword [rsp+0x90], 8
    movifprep   tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov    dword [rsp+0x90], 16
    movifprep   tmp_stridem, 256
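; shared setup for the wide dy2 path: select the vertical filter from myd,
; precompute dx*4 and mx+dx*[0-3], and spill the per-column state that
; .dy2_hloop_prep restores between column blocks.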
.dy2_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
%if ARCH_X86_64
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m11 [base+pd_0x4000]
 %define m8   m0
 %define m9   m1
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define tmpq r0
  %define ssq ssm
 %else
  %define dstq r0
 %endif
    mov                  r5, [esp+0x1f0]
    mov                  r3, [esp+0x1f4]
    shr                  r5, 16
    sub                srcq, 3
    movd                m15, r5
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r0, r0m
    mov                  r3, r3m
%endif
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq                 m3, r4q
    punpcklbw            m3, m3
    psraw                m3, 8
%else
    movd                 m5, r4
    movd                 m6, r5
    punpckldq            m5, m6
    punpcklbw            m5, m5
    psraw                m5, 8
    SWAP                 m3, m5
%endif
    mova        [rsp+0x100], m7
    mova        [rsp+0x120], m15
    mov         [rsp+0x098], srcq
    mov         [rsp+0x130], r0q ; dstq / tmpq
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova        [rsp+0x140], m0
    mova        [rsp+0x150], m1
    mova        [rsp+0x160], m2
    mova        [rsp+0x170], m3
%if ARCH_X86_64 && UNIX64
    mov                  hm, hd
%elif ARCH_X86_32
    SWAP                 m5, m3
    mov                  r5, hm
    mov         [esp+0x134], r5
%endif
    jmp .dy2_hloop
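; end of a column block: step dstq/tmpq to the next 8 pixels, reload h and
; the horizontal scaling state, and advance mx by dx*4 for the next block.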
.dy2_hloop_prep:
    dec   dword [rsp+0x090]
    jz .ret
%if ARCH_X86_64
    add   qword [rsp+0x130], 8*(isprep+1)
    mov                  hd, hm
%else
    add   dword [rsp+0x130], 8*(isprep+1)
    mov                  r5, [esp+0x134]
    mov                  r0, [esp+0x130]
%endif
    mova                 m7, [rsp+0x100]
    mova                m14, [rsp+0x110]
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
%else
 %define m10 [base+pd_0x3ff]
%endif
    mova                m15, [rsp+0x120]
    mov                srcq, [rsp+0x098]
%if ARCH_X86_64
    mov                 r0q, [rsp+0x130] ; dstq / tmpq
%else
    mov                  hm, r5
    mov                 r0m, r0
    mov                  r3, r3m
%endif
    paddd               m14, m7
.dy2_hloop:
    pxor                 m9, m9
%if ARCH_X86_64
    mova                m11, [base+pq_0x40000000]
%else
 %define m11 [base+pq_0x40000000]
%endif
    psrld                m2, m14, 10
    mova              [rsp], m2
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd                m5, m15, m6
    pcmpeqd              m6, m9
    psrldq               m2, m5, 8
%if ARCH_X86_64
    movd                r4d, m5
    movd                r6d, m2
    psrldq               m5, 4
    psrldq               m2, 4
    movd                r7d, m5
    movd                r9d, m2
    movq                 m0, [base+subpel_filters+r4*8]
    movq                 m1, [base+subpel_filters+r6*8]
    movhps               m0, [base+subpel_filters+r7*8]
    movhps               m1, [base+subpel_filters+r9*8]
%else
    movd                 r0, m5
    movd                 rX, m2
    psrldq               m5, 4
    psrldq               m2, 4
    movd                 r4, m5
    movd                 r5, m2
    movq                 m0, [base+subpel_filters+r0*8]
    movq                 m1, [base+subpel_filters+rX*8]
    movhps               m0, [base+subpel_filters+r4*8]
    movhps               m1, [base+subpel_filters+r5*8]
    pxor                 m2, m2
 %define m9 m2
%endif
    paddd               m14, m7 ; mx+dx*[4-7]
    pand                 m5, m14, m10
    psrld                m5, 6
    paddd               m15, m5
    pcmpeqd              m5, m9
    mova        [rsp+0x110], m14
    psrldq               m4, m15, 8
%if ARCH_X86_64
    movd               r10d, m15
    movd               r11d, m4
    psrldq              m15, 4
    psrldq               m4, 4
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    psrldq               m4, m14, 8
    movd               r10d, m14
    movd               r11d, m4
    psrldq              m14, 4
    psrldq               m4, 4
    movd               r13d, m14
    movd                rXd, m4
    mov                 r4d, [rsp+ 0]
    mov                 r6d, [rsp+ 8]
    mov                 r7d, [rsp+ 4]
    mov                 r9d, [rsp+12]
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m8, m11, m4
    pand                 m9, m11, m6
    pand                m15, m11, m7
    pand                m11, m11, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn                m7, m2
    pandn                m5, m3
    por                  m8, m4
    por                  m9, m6
    por                 m15, m7
    por                 m11, m5
    mova         [rsp+0x10], m8
    mova         [rsp+0x20], m9
    mova         [rsp+0x30], m15
    mova         [rsp+0x40], m11
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
    mova         [rsp+0x50], m1
    mova         [rsp+0x60], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
    mova         [rsp+0x70], m3
    mova         [rsp+0x80], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
    SWAP                 m7, m0
    SWAP                 m8, m14
    mova                 m1, [rsp+0x50]
    mova                 m2, [rsp+0x60]
    mova                 m3, [rsp+0x70]
    mova                m15, [rsp+0x80]
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    SWAP                m14, m8
    mova                 m8, [rsp+0x140]
    mova                 m9, [rsp+0x150]
    mova                m10, [rsp+0x160]
    mova                m11, [rsp+0x170]
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m15; 23a
    punpckhwd            m3, m15    ; 23b
    mova         [rsp+0x50], m4
    mova         [rsp+0x60], m5
    mova         [rsp+0x70], m6
    mova         [rsp+0x80], m7
%else
    movd                 r0, m15
    movd                 rX, m4
    psrldq              m15, 4
    psrldq               m4, 4
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [esp+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10
    mova           [esp+16], m14
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
    mova         [esp+0x20], m0
    mova         [esp+0x30], m1
    mova         [esp+0x40], m2
    mova         [esp+0x50], m3
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m11, m4
    pand                 m1, m11, m6
    pand                 m2, m11, m7
    pand                 m3, m11, m5
    pandn                m4, [esp+0x20]
    pandn                m6, [esp+0x30]
    pandn                m7, [esp+0x40]
    pandn                m5, [esp+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    mova        [esp+0x20], m0
    mova        [esp+0x30], m1
    mova        [esp+0x40], m2
    mova        [esp+0x50], m3
    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
    mova                 m5, [esp+0x1a0]
    mova                 m6, [esp+0x1b0]
    mova                 m7, [esp+0x1c0]
    mova                 m0, [esp+0x1d0]
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova        [esp+0x1a0], m4
    mova        [esp+0x1b0], m5
    mova        [esp+0x1c0], m6
    mova        [esp+0x1d0], m7
    mova                 m1, [esp+0x060]
    mova                 m2, [esp+0x070]
    mova                 m3, [esp+0x180]
    mova                 m4, [esp+0x190]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova        [esp+0x180], m2
    mova        [esp+0x190], m3
 %define m8  [esp+0x140]
 %define m9  [esp+0x150]
 %define m10 [esp+0x160]
 %define m11 [esp+0x170]
%endif
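; .dy2_vloop: same accumulate/round/store pattern as .dy1_vloop, but each
; iteration's new 67 row pair comes from two fresh source rows (dy == 2).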
.dy2_vloop:
%if ARCH_X86_32
    mov                  r0, r0m
%endif
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m8
    pmaddwd              m6, m2, m9
    pmaddwd              m7, m3, m9
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
%if ARCH_X86_64
    pmaddwd              m6, [rsp+0x50], m10
    pmaddwd              m7, [rsp+0x60], m10
%else
    pmaddwd              m6, [esp+0x1a0], m10
    pmaddwd              m7, [esp+0x1b0], m10
%endif
    paddd                m4, m6
    paddd                m5, m7
%if ARCH_X86_64
    pmaddwd              m6, [rsp+0x70], m11
    pmaddwd              m7, [rsp+0x80], m11
%else
    pmaddwd              m6, [esp+0x1c0], m11
    pmaddwd              m7, [esp+0x1d0], m11
%endif
    paddd                m4, m6
    paddd                m5, m7
    psrad                m4, rndshift
    psrad                m5, rndshift
    packssdw             m4, m5
%ifidn %1, put
    packuswb             m4, m4
    movq             [dstq], m4
    add                dstq, dsm
%else
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
%if ARCH_X86_32
    mov                 r0m, r0
%endif
    dec                  hd
    jz .dy2_hloop_prep
%if ARCH_X86_64
    mova                 m8, [rsp+0x10]
    mova                 m9, [rsp+0x20]
    mova                m10, [rsp+0x30]
    mova                m11, [rsp+0x40]
    mova                 m0, m2             ; 01a
    mova                 m1, m3             ; 01b
    MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
    mova                 m3, [rsp+0x50] ; 23a
    mova                 m4, [rsp+0x60] ; 23b
    mova                 m5, [rsp+0x70] ; 45a
    mova                 m7, [rsp+0x80] ; 45b
    mova                 m8, [rsp+0x140]
    mova                 m9, [rsp+0x150]
    mova                m10, [rsp+0x160]
    mova                m11, [rsp+0x170]
    punpcklwd           m14, m2, m6     ; 67a
    punpckhwd            m2, m6         ; 67b
    mova         [rsp+0x50], m5
    mova         [rsp+0x60], m7
    mova         [rsp+0x70], m14
    mova         [rsp+0x80], m2
    mova                 m2, m3
    mova                 m3, m4
%else
    MC_8TAP_SCALED_H   0x20, 0
    punpcklwd            m6, m0, m4
    punpckhwd            m7, m0, m4
    mova                 m0, [esp+0x180] ; 01a
    mova                 m1, [esp+0x190] ; 01b
    mova                 m2, [rsp+0x1a0]  ; 23a
    mova                 m3, [esp+0x1b0]  ; 23b
    mova                 m4, [esp+0x1c0]  ; 45a
    mova                 m5, [esp+0x1d0]  ; 45b
    mova        [esp+0x180], m2
    mova        [esp+0x190], m3
    mova        [esp+0x1a0], m4
    mova        [esp+0x1b0], m5
    mova        [esp+0x1c0], m6          ; 67a
    mova        [esp+0x1d0], m7          ; 67b
%endif
    jmp .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
 %define r0m [rstk+stack_offset+ 4]
 %define r1m [rstk+stack_offset+ 8]
 %define r2m [rstk+stack_offset+12]
 %define r3m [rstk+stack_offset+16]
%endif
%undef isprep
%endmacro

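; Bilinear scaled variants reuse the 8tap scaled entry points: both filter
; type fields are forced to 5*15 (filter set 5), which the subpel_filters
; lookup resolves to the bilinear coefficients.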
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_8bpc
    mov                 t0d, (5*15 << 16) | 5*15
    mov                 t1d, (5*15 << 16) | 5*15
    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro

%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN put
FN put_8tap_scaled, sharp,          SHARP,   SHARP
FN put_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
FN put_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
FN put_8tap_scaled, smooth,         SMOOTH,  SMOOTH
FN put_8tap_scaled, sharp_regular,  SHARP,   REGULAR
FN put_8tap_scaled, regular_sharp,  REGULAR, SHARP
FN put_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN put_8tap_scaled, regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN prep
FN prep_8tap_scaled, sharp,          SHARP,   SHARP
FN prep_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
FN prep_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
FN prep_8tap_scaled, smooth,         SMOOTH,  SMOOTH
FN prep_8tap_scaled, sharp_regular,  SHARP,   REGULAR
FN prep_8tap_scaled, regular_sharp,  REGULAR, SHARP
FN prep_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN prep_8tap_scaled, regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

%if ARCH_X86_32
 %macro SAVE_ALPHA_BETA 0
    mov              alpham, alphad
    mov               betam, betad
 %endmacro

 %macro SAVE_DELTA_GAMMA 0
    mov              deltam, deltad
    mov              gammam, gammad
 %endmacro

 %macro LOAD_ALPHA_BETA_MX 0
    mov                 mym, myd
    mov              alphad, alpham
    mov               betad, betam
    mov                 mxd, mxm
 %endmacro

 %macro LOAD_DELTA_GAMMA_MY 0
    mov                 mxm, mxd
    mov              deltad, deltam
    mov              gammad, gammam
    mov                 myd, mym
 %endmacro

 %define PIC_reg r2
 %define PIC_base_offset $$
 %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
%else
 %define SAVE_ALPHA_BETA
 %define SAVE_DELTA_GAMMA
 %define PIC_sym(sym) sym
%endif

%if ARCH_X86_32
 %if STACK_ALIGNMENT < required_stack_alignment
  %assign copy_args 8*4
 %else
  %assign copy_args 0
 %endif
%endif

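; RELOC_ARGS: when the incoming stack needed realignment, copy the caller's
; arguments into the realigned frame so the named-argument references below
; stay valid.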
%macro RELOC_ARGS 0
 %if copy_args
    mov                  r0, r0m
    mov                  r1, r1m
    mov                  r2, r2m
    mov                  r3, r3m
    mov                  r5, r5m
    mov                dstm, r0
    mov                 dsm, r1
    mov                srcm, r2
    mov                 ssm, r3
    mov                 mxm, r5
    mov                  r0, r6m
    mov                 mym, r0
 %endif
%endmacro

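; BLENDHWDW merges the high word of every dword in %2 into %1; the pre-sse4
; fallback expects m10 to hold the 0xffff0000 blend mask set up by .main.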
%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
 %if cpuflag(sse4)
    pblendw              %1, %2, 0xAA
 %else
    pand                 %2, m10
    por                  %1, %2
 %endif
%endmacro

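; WARP_V: vertical pass for one output row. The eight per-column filter
; kernels (a-h) are fetched at my+k*delta for k=0..7 (index = value >> 10),
; and my then advances by gamma for the next row (gamma was pre-adjusted by
; -3*delta in .main).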
%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
 %if ARCH_X86_32
  %define m8  m4
  %define m9  m5
  %define m14 m6
  %define m15 m7
  %define m11 m7
 %endif
 %if notcpuflag(ssse3) || ARCH_X86_32
    pxor                m11, m11
 %endif
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                 m2, [filterq+myq  *8] ; a
    movq                 m8, [filterq+tmp1q*8] ; e
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+deltaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                 m3, [filterq+tmp2q*8] ; b
    movq                 m0, [filterq+tmp1q*8] ; f
    punpcklwd            m2, m3
    punpcklwd            m8, m0
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                 m0, [filterq+myq  *8] ; c
    movq                 m9, [filterq+tmp1q*8] ; g
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+gammaq]       ; my += gamma
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                 m3, [filterq+tmp2q*8] ; d
    movq                 m1, [filterq+tmp1q*8] ; h
    punpcklwd            m0, m3
    punpcklwd            m9, m1
    punpckldq            m1, m2, m0
    punpckhdq            m2, m0
    punpcklbw            m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw            m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    punpcklbw            m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw           m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd              m0, %3
    pmaddwd              m3, %5
    pmaddwd              m1, %7
    pmaddwd             m14, %9
    paddd                m0, m3
    paddd                m1, m14
    paddd                m0, m1
    mova                 %1, m0
 %if ARCH_X86_64
    SWAP                 m3, m14
 %endif
    punpckldq            m0, m8, m9
    punpckhdq            m8, m9
    punpcklbw            m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
    punpckhbw           m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
    punpcklbw            m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
    punpckhbw           m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
    pmaddwd              m1, %4
    pmaddwd             m14, %6
    pmaddwd              m2, %8
    pmaddwd             m15, %10
    paddd                m1, m14
    paddd                m2, m15
    paddd                m1, m2
    mova                 %2, m1
 %if ARCH_X86_64
    SWAP                m14, m3
 %endif
%endmacro

%if ARCH_X86_64
 %define counterd r4d
%else
 %if copy_args == 0
  %define counterd dword r4m
 %else
  %define counterd dword [esp+stack_size-4*7]
 %endif
%endif

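; warp_affine_8x8t: like warp_affine_8x8 below, but stores the 16-bit
; intermediate (prep) values, two rows per iteration, instead of packing
; to 8-bit pixels; ts is the tmp stride in elements.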
%macro WARP_AFFINE_8X8T 0
%if ARCH_X86_64
cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
%else
cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
 %if copy_args
  %define tmpm [esp+stack_size-4*1]
  %define tsm  [esp+stack_size-4*2]
 %endif
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
.loop:
%if ARCH_X86_32
 %define m12 m4
 %define m13 m5
 %define m14 m6
 %define m15 m7
    mova                m12, [esp+0xC0]
    mova                m13, [esp+0xD0]
    mova                m14, [esp+0xE0]
    mova                m15, [esp+0xF0]
%endif
%if cpuflag(ssse3)
    psrad               m12, 13
    psrad               m13, 13
    psrad               m14, 13
    psrad               m15, 13
    packssdw            m12, m13
    packssdw            m14, m15
    mova                m13, [PIC_sym(pw_8192)]
    pmulhrsw            m12, m13 ; (x + (1 << 6)) >> 7
    pmulhrsw            m14, m13
%else
 %if ARCH_X86_32
  %define m10 m0
 %endif
    mova                m10, [PIC_sym(pd_16384)]
    paddd               m12, m10
    paddd               m13, m10
    paddd               m14, m10
    paddd               m15, m10
    psrad               m12, 15
    psrad               m13, 15
    psrad               m14, 15
    psrad               m15, 15
    packssdw            m12, m13
    packssdw            m14, m15
%endif
    mova       [tmpq+tsq*0], m12
    mova       [tmpq+tsq*2], m14
    dec            counterd
    jz   mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
%if ARCH_X86_32
    mov                tmpm, tmpd
    mov                  r0, [esp+0x100]
    mov                  r1, [esp+0x104]
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
    lea                tmpq, [tmpq+tsq*4]
    jmp .loop
%endmacro

%macro WARP_AFFINE_8X8 0
%if ARCH_X86_64
cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, delta, my, gamma
%else
cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, delta, my, gamma
 %define alphaq     r0
 %define alphad     r0
 %define alpham     [esp+gprsize+0x100]
 %define betaq      r1
 %define betad      r1
 %define betam      [esp+gprsize+0x104]
 %define deltaq     r0
 %define deltad     r0
 %define deltam     [esp+gprsize+0x108]
 %define gammaq     r1
 %define gammad     r1
 %define gammam     [esp+gprsize+0x10C]
 %define filterq    r3
 %define tmp1q      r4
 %define tmp1d      r4
 %define tmp1m      [esp+gprsize+0x110]
 %define myq        r5
 %define myd        r5
 %define mym        r6m
 %if copy_args
  %define dstm [esp+stack_size-4*1]
  %define dsm  [esp+stack_size-4*2]
  %define srcm [esp+stack_size-4*3]
  %define ssm  [esp+stack_size-4*4]
  %define mxm  [esp+stack_size-4*5]
  %define mym  [esp+stack_size-4*6]
 %endif
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_32
    mov                dstm, dstd
    mov              alphad, [esp+0x100]
    mov               betad, [esp+0x104]
%endif
    call .main2
    lea                dstq, [dstq+dsq*2]
.start:
%if notcpuflag(sse4)
 %if cpuflag(ssse3)
  %define roundval pw_8192
 %else
  %define roundval pd_262144
 %endif
 %if ARCH_X86_64
    mova                m10, [PIC_sym(roundval)]
 %else
  %define m10 [PIC_sym(roundval)]
 %endif
%endif
%if ARCH_X86_32
 %define m12 m5
 %define m13 m6
    mova                m12, [esp+0xC0]
    mova                m13, [esp+0xD0]
%endif
%if cpuflag(sse4)
 %if ARCH_X86_32
  %define m11 m4
    pxor                m11, m11
 %endif
    psrad               m12, 18
    psrad               m13, 18
    packusdw            m12, m13
    pavgw               m12, m11 ; (x + (1 << 10)) >> 11
%else
 %if cpuflag(ssse3)
    psrad               m12, 17
    psrad               m13, 17
    packssdw            m12, m13
    pmulhrsw            m12, m10
 %else
    paddd               m12, m10
    paddd               m13, m10
    psrad               m12, 19
    psrad               m13, 19
    packssdw            m12, m13
 %endif
%endif
%if ARCH_X86_32
 %define m14 m6
 %define m15 m7
    mova                m14, [esp+0xE0]
    mova                m15, [esp+0xF0]
%endif
%if cpuflag(sse4)
    psrad               m14, 18
    psrad               m15, 18
    packusdw            m14, m15
    pavgw               m14, m11 ; (x + (1 << 10)) >> 11
%else
 %if cpuflag(ssse3)
    psrad               m14, 17
    psrad               m15, 17
    packssdw            m14, m15
    pmulhrsw            m14, m10
 %else
    paddd               m14, m10
    paddd               m15, m10
    psrad               m14, 19
    psrad               m15, 19
    packssdw            m14, m15
 %endif
%endif
    packuswb            m12, m14
    movq       [dstq+dsq*0], m12
    movhps     [dstq+dsq*1], m12
    dec            counterd
    jg .loop
.end:
    RET
ALIGN function_align
.main:
%assign stack_offset stack_offset+gprsize
%if ARCH_X86_32
 %assign stack_size stack_size+4
 %if copy_args
  %assign stack_offset stack_offset-4
 %endif
    RELOC_ARGS
    LEA             PIC_reg, $$
 %define PIC_mem [esp+gprsize+0x114]
    mov               abcdd, abcdm
 %if copy_args == 0
    mov                 ssd, ssm
    mov                 mxd, mxm
 %endif
    mov             PIC_mem, PIC_reg
    mov                srcd, srcm
%endif
    movsx            deltad, word [abcdq+2*2]
    movsx            gammad, word [abcdq+2*3]
    lea               tmp1d, [deltaq*3]
    sub              gammad, tmp1d    ; gamma -= delta*3
    SAVE_DELTA_GAMMA
%if ARCH_X86_32
    mov               abcdd, abcdm
%endif
    movsx            alphad, word [abcdq+2*0]
    movsx             betad, word [abcdq+2*1]
    lea               tmp1q, [ssq*3+3]
    add                 mxd, 512+(64<<10)
    lea               tmp2d, [alphaq*3]
    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
%if ARCH_X86_32
    mov                srcm, srcd
    mov             PIC_reg, PIC_mem
%endif
    sub               betad, tmp2d    ; beta -= alpha*3
    lea             filterq, [PIC_sym(mc_warp_filter2)]
%if ARCH_X86_64
    mov                 myd, r6m
 %if cpuflag(ssse3)
    pxor                m11, m11
 %endif
%endif
    call .h
    psrld                m2, m0, 16
    psrld                m3, m1, 16
%if ARCH_X86_32
 %if notcpuflag(ssse3)
    mova [esp+gprsize+0x00], m2
 %endif
    mova [esp+gprsize+0x10], m3
%endif
    call .h
    psrld                m4, m0, 16
    psrld                m5, m1, 16
%if ARCH_X86_32
    mova [esp+gprsize+0x20], m4
    mova [esp+gprsize+0x30], m5
%endif
    call .h
%if ARCH_X86_64
 %define blendmask [rsp+gprsize+0x80]
%else
 %if notcpuflag(ssse3)
    mova                 m2, [esp+gprsize+0x00]
 %endif
    mova                 m3, [esp+gprsize+0x10]
 %define blendmask [esp+gprsize+0x120]
 %define m10 m7
%endif
    pcmpeqd             m10, m10
    pslld               m10, 16
    mova          blendmask, m10
    BLENDHWDW            m2, m0 ; 0
    BLENDHWDW            m3, m1 ; 2
    mova [rsp+gprsize+0x00], m2
    mova [rsp+gprsize+0x10], m3
    call .h
%if ARCH_X86_32
    mova                 m4, [esp+gprsize+0x20]
    mova                 m5, [esp+gprsize+0x30]
%endif
    mova                m10, blendmask
    BLENDHWDW            m4, m0 ; 1
    BLENDHWDW            m5, m1 ; 3
    mova [rsp+gprsize+0x20], m4
    mova [rsp+gprsize+0x30], m5
    call .h
%if ARCH_X86_32
 %if notcpuflag(ssse3)
    mova                 m2, [esp+gprsize+0x00]
 %endif
    mova                 m3, [esp+gprsize+0x10]
 %define m10 m5
%endif
    psrld                m6, m2, 16
    psrld                m7, m3, 16
    mova                m10, blendmask
    BLENDHWDW            m6, m0 ; 2
    BLENDHWDW            m7, m1 ; 4
    mova [rsp+gprsize+0x40], m6
    mova [rsp+gprsize+0x50], m7
    call .h
%if ARCH_X86_32
    mova                m4, [esp+gprsize+0x20]
    mova                m5, [esp+gprsize+0x30]
%endif
    psrld               m2, m4, 16
    psrld               m3, m5, 16
    mova                m10, blendmask
    BLENDHWDW           m2, m0 ; 3
    BLENDHWDW           m3, m1 ; 5
    mova [rsp+gprsize+0x60], m2
    mova [rsp+gprsize+0x70], m3
    call .h
%if ARCH_X86_32
    mova                 m6, [esp+gprsize+0x40]
    mova                 m7, [esp+gprsize+0x50]
 %define m10 m7
%endif
    psrld                m4, m6, 16
    psrld                m5, m7, 16
    mova                m10, blendmask
    BLENDHWDW            m4, m0 ; 4
    BLENDHWDW            m5, m1 ; 6
%if ARCH_X86_64
    add                 myd, 512+(64<<10)
    mova                 m6, m2
    mova                 m7, m3
%else
    mova [esp+gprsize+0x80], m4
    mova [esp+gprsize+0x90], m5
    add           dword mym, 512+(64<<10)
%endif
    mov            counterd, 4
    SAVE_ALPHA_BETA
.main2:
    call .h
%if ARCH_X86_32
    mova                 m6, [esp+gprsize+0x60]
    mova                 m7, [esp+gprsize+0x70]
 %define m10 m5
%endif
    psrld                m6, 16
    psrld                m7, 16
    mova                m10, blendmask
    BLENDHWDW            m6, m0 ; 5
    BLENDHWDW            m7, m1 ; 7
%if ARCH_X86_64
    WARP_V              m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
                                  m4, m5, \
                                  [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
                                  m6, m7
%else
    mova [esp+gprsize+0xA0], m6
    mova [esp+gprsize+0xB0], m7
    LOAD_DELTA_GAMMA_MY
    WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
           [esp+gprsize+0x00], [esp+gprsize+0x10], \
           [esp+gprsize+0x80], [esp+gprsize+0x90], \
           [esp+gprsize+0x20], [esp+gprsize+0x30], \
           [esp+gprsize+0xA0], [esp+gprsize+0xB0]
    LOAD_ALPHA_BETA_MX
%endif
    call .h
    mova                 m2, [rsp+gprsize+0x40]
    mova                 m3, [rsp+gprsize+0x50]
%if ARCH_X86_32
    mova                 m4, [rsp+gprsize+0x80]
    mova                 m5, [rsp+gprsize+0x90]
 %define m10 m7
%endif
    mova [rsp+gprsize+0x00], m2
    mova [rsp+gprsize+0x10], m3
    mova [rsp+gprsize+0x40], m4
    mova [rsp+gprsize+0x50], m5
    psrld                m4, 16
    psrld                m5, 16
    mova                m10, blendmask
    BLENDHWDW            m4, m0 ; 6
    BLENDHWDW            m5, m1 ; 8
%if ARCH_X86_64
    WARP_V              m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
                                  m6, m7, \
                                  [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
                                  m4, m5
%else
    mova [esp+gprsize+0x80], m4
    mova [esp+gprsize+0x90], m5
    LOAD_DELTA_GAMMA_MY
    WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
           [esp+gprsize+0x20], [esp+gprsize+0x30], \
           [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
           [esp+gprsize+0x00], [esp+gprsize+0x10], \
           [esp+gprsize+0x80], [esp+gprsize+0x90]
    mov                 mym, myd
    mov                dstd, dstm
    mov                 dsd, dsm
    mov                 mxd, mxm
%endif
    mova                 m2, [rsp+gprsize+0x60]
    mova                 m3, [rsp+gprsize+0x70]
%if ARCH_X86_32
    mova                 m6, [esp+gprsize+0xA0]
    mova                 m7, [esp+gprsize+0xB0]
%endif
    mova [rsp+gprsize+0x20], m2
    mova [rsp+gprsize+0x30], m3
    mova [rsp+gprsize+0x60], m6
    mova [rsp+gprsize+0x70], m7
    ret
ALIGN function_align
7846.h:
7847%if ARCH_X86_32
7848 %define m8  m3
7849 %define m9  m4
7850 %define m10 m5
7851 %define m14 m6
7852 %define m15 m7
7853%endif
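    ; fetch the 8 per-column filters: filter n is indexed by
    ; (mx + n*alpha) >> 10; mx += beta per row (beta was pre-adjusted
    ; by -3*alpha above so the final lea lands on mx + beta)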
7854    lea               tmp1d, [mxq+alphaq*4]
7855    lea               tmp2d, [mxq+alphaq*1]
7856%if ARCH_X86_32
7857 %assign stack_offset stack_offset+4
7858 %assign stack_size stack_size+4
7859 %define PIC_mem [esp+gprsize*2+0x114]
7860    mov             PIC_mem, PIC_reg
7861    mov                srcd, srcm
7862%endif
7863    movu                m10, [srcq]
7864%if ARCH_X86_32
7865    add                srcd, ssm
7866    mov                srcm, srcd
7867    mov             PIC_reg, PIC_mem
7868%else
7869    add                srcq, ssq
7870%endif
7871    shr                 mxd, 10
7872    shr               tmp1d, 10
7873    movq                 m1, [filterq+mxq  *8]  ; 0 X
7874    movq                 m8, [filterq+tmp1q*8]  ; 4 X
7875    lea               tmp1d, [tmp2q+alphaq*4]
7876    lea                 mxd, [tmp2q+alphaq*1]
7877    shr               tmp2d, 10
7878    shr               tmp1d, 10
7879    movhps               m1, [filterq+tmp2q*8]  ; 0 1
7880    movhps               m8, [filterq+tmp1q*8]  ; 4 5
7881    lea               tmp1d, [mxq+alphaq*4]
7882    lea               tmp2d, [mxq+alphaq*1]
7883    shr                 mxd, 10
7884    shr               tmp1d, 10
7885%if cpuflag(ssse3)
7886    movq                m14, [filterq+mxq  *8]  ; 2 X
7887    movq                 m9, [filterq+tmp1q*8]  ; 6 X
7888    lea               tmp1d, [tmp2q+alphaq*4]
7889    lea                 mxd, [tmp2q+betaq]  ; mx += beta
7890    shr               tmp2d, 10
7891    shr               tmp1d, 10
7892    movhps              m14, [filterq+tmp2q*8]  ; 2 3
7893    movhps               m9, [filterq+tmp1q*8]  ; 6 7
7894    pshufb               m0, m10, [PIC_sym(warp_8x8_shufA)]
7895    pmaddubsw            m0, m1
7896    pshufb               m1, m10, [PIC_sym(warp_8x8_shufB)]
7897    pmaddubsw            m1, m8
7898    pshufb              m15, m10, [PIC_sym(warp_8x8_shufC)]
7899    pmaddubsw           m15, m14
7900    pshufb              m10, m10, [PIC_sym(warp_8x8_shufD)]
7901    pmaddubsw           m10, m9
7902    phaddw               m0, m15
7903    phaddw               m1, m10
7904%else
7905 %if ARCH_X86_32
7906  %define m11 m2
7907 %endif
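    ; SSE2 fallback: emulate pshufb and pmaddubsw. m14 collects the
    ; even and odd source bytes so that successive 1-byte psrldq steps
    ; reproduce the warp_8x8_shufA/B/C/D byte patterns, and each
    ; pmaddubsw is emulated by splitting the unsigned pixels and the
    ; signed coefficients into even/odd words, i.e. per output word:
    ;   dst[i] = src[2*i]*coef[2*i] + src[2*i+1]*coef[2*i+1]
    ; phaddw is likewise emulated at the end via shift/add/packssdw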
7908    pcmpeqw              m0, m0
7909    psrlw               m14, m0, 8
7910    psrlw               m15, m10, 8     ; 01 03 05 07  09 11 13 15
7911    pand                m14, m10        ; 00 02 04 06  08 10 12 14
7912    packuswb            m14, m15        ; 00 02 04 06  08 10 12 14  01 03 05 07  09 11 13 15
7913    psrldq               m9, m0, 4
7914    pshufd               m0, m14, q0220
7915    pand                 m0, m9
7916    psrldq              m14, 1          ; 02 04 06 08  10 12 14 01  03 05 07 09  11 13 15 __
7917    pslldq              m15, m14, 12
7918    por                  m0, m15    ; shufA
7919    psrlw               m15, m0, 8
7920    psraw               m11, m1, 8
7921    psllw                m0, 8
7922    psllw                m1, 8
7923    psrlw                m0, 8
7924    psraw                m1, 8
7925    pmullw              m15, m11
7926    pmullw               m0, m1
7927    paddw                m0, m15    ; pmaddubsw m0, m1
7928    pshufd              m15, m14, q0220
7929    pand                m15, m9
7930    psrldq              m14, 1          ; 04 06 08 10  12 14 01 03  05 07 09 11  13 15 __ __
7931    pslldq               m1, m14, 12
7932    por                 m15, m1     ; shufC
7933    pshufd               m1, m14, q0220
7934    pand                 m1, m9
7935    psrldq              m14, 1          ; 06 08 10 12  14 01 03 05  07 09 11 13  15 __ __ __
7936    pslldq              m11, m14, 12
7937    por                  m1, m11    ; shufB
7938    pshufd              m10, m14, q0220
7939    pand                m10, m9
7940    psrldq              m14, 1          ; 08 10 12 14  01 03 05 07  09 11 13 15  __ __ __ __
7941    pslldq              m14, m14, 12
7942    por                 m10, m14    ; shufD
7943    psrlw                m9, m1, 8
7944    psraw               m11, m8, 8
7945    psllw                m1, 8
7946    psllw                m8, 8
7947    psrlw                m1, 8
7948    psraw                m8, 8
7949    pmullw               m9, m11
7950    pmullw               m1, m8
7951    paddw                m1, m9     ; pmaddubsw m1, m8
7952    movq                m14, [filterq+mxq  *8]  ; 2 X
7953    movq                 m9, [filterq+tmp1q*8]  ; 6 X
7954    lea               tmp1d, [tmp2q+alphaq*4]
7955    lea                 mxd, [tmp2q+betaq]  ; mx += beta
7956    shr               tmp2d, 10
7957    shr               tmp1d, 10
7958    movhps              m14, [filterq+tmp2q*8]  ; 2 3
7959    movhps               m9, [filterq+tmp1q*8]  ; 6 7
7960    psrlw                m8, m15, 8
7961    psraw               m11, m14, 8
7962    psllw               m15, 8
7963    psllw               m14, 8
7964    psrlw               m15, 8
7965    psraw               m14, 8
7966    pmullw               m8, m11
7967    pmullw              m15, m14
7968    paddw               m15, m8     ; pmaddubsw m15, m14
7969    psrlw                m8, m10, 8
7970    psraw               m11, m9, 8
7971    psllw               m10, 8
7972    psllw                m9, 8
7973    psrlw               m10, 8
7974    psraw                m9, 8
7975    pmullw               m8, m11
7976    pmullw              m10, m9
7977    paddw               m10, m8     ; pmaddubsw m10, m9
7978    pslld                m8, m0, 16
7979    pslld                m9, m1, 16
7980    pslld               m14, m15, 16
7981    pslld               m11, m10, 16
7982    paddw                m0, m8
7983    paddw                m1, m9
7984    paddw               m15, m14
7985    paddw               m10, m11
7986    psrad                m0, 16
7987    psrad                m1, 16
7988    psrad               m15, 16
7989    psrad               m10, 16
7990    packssdw             m0, m15    ; phaddw m0, m15
7991    packssdw             m1, m10    ; phaddw m1, m10
7992%endif
7993    mova                m14, [PIC_sym(pw_8192)]
7994    mova                 m9, [PIC_sym(pd_32768)]
7995    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
7996    pmaddwd              m1, m14
7997    paddd                m0, m9  ; rounded 14-bit result in upper 16 bits of dword
7998    paddd                m1, m9
7999    ret
8000%endmacro
8001
8002%if WIN64
8003DECLARE_REG_TMP 6, 4
8004%else
8005DECLARE_REG_TMP 6, 7
8006%endif
8007
8008%macro BIDIR_FN 1 ; op
8009    %1                    0
8010    lea            stride3q, [strideq*3]
8011    jmp                  wq
8012.w4_loop:
8013    %1_INC_PTR            2
8014    %1                    0
8015    lea                dstq, [dstq+strideq*4]
.w4: ; 4-wide: write 4 rows of 4 pixels per iteration
    movd   [dstq          ], m0      ; store dw[0]
    pshuflw              m1, m0, q1032 ; swap dw[0] and dw[1]
    movd   [dstq+strideq*1], m1      ; store dw[1]
    punpckhqdq           m0, m0      ; move dw[3,2] into dw[1,0]
    movd   [dstq+strideq*2], m0      ; store dw[2]
    psrlq                m0, 32      ; shift dw[3] down into dw[0]
    movd   [dstq+stride3q ], m0      ; store dw[3]
8024    sub                  hd, 4
8025    jg .w4_loop
8026    RET
8027.w8_loop:
8028    %1_INC_PTR            2
8029    %1                    0
8030    lea                dstq, [dstq+strideq*2]
8031.w8:
8032    movq   [dstq          ], m0
8033    movhps [dstq+strideq*1], m0
8034    sub                  hd, 2
8035    jg .w8_loop
8036    RET
8037.w16_loop:
8038    %1_INC_PTR            2
8039    %1                    0
8040    lea                dstq, [dstq+strideq]
8041.w16:
8042    mova   [dstq          ], m0
8043    dec                  hd
8044    jg .w16_loop
8045    RET
8046.w32_loop:
8047    %1_INC_PTR            4
8048    %1                    0
8049    lea                dstq, [dstq+strideq]
8050.w32:
8051    mova   [dstq          ], m0
8052    %1                    2
8053    mova   [dstq + 16     ], m0
8054    dec                  hd
8055    jg .w32_loop
8056    RET
8057.w64_loop:
8058    %1_INC_PTR            8
8059    %1                    0
8060    add                dstq, strideq
8061.w64:
8062    %assign i 0
8063    %rep 4
8064    mova   [dstq + i*16   ], m0
8065    %assign i i+1
8066    %if i < 4
8067    %1                    2*i
8068    %endif
8069    %endrep
8070    dec                  hd
8071    jg .w64_loop
8072    RET
8073.w128_loop:
8074    %1_INC_PTR            16
8075    %1                    0
8076    add                dstq, strideq
8077.w128:
8078    %assign i 0
8079    %rep 8
8080    mova   [dstq + i*16   ], m0
8081    %assign i i+1
8082    %if i < 8
8083    %1                    2*i
8084    %endif
8085    %endrep
8086    dec                  hd
8087    jg .w128_loop
8088    RET
8089%endmacro
8090
8091%macro AVG 1 ; src_offset
    ; averages the int16 intermediates from tmp1/tmp2 into uint8 pixels
    mova                 m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
    paddw                m0, [tmp2q+(%1+0)*mmsize] ; add the 8 int16 coefs from tmp2
    mova                 m1, [tmp1q+(%1+1)*mmsize]
    paddw                m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    packuswb             m0, m1 ; pack the words of m0 & m1 to bytes with unsigned saturation
8100%endmacro
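
; A scalar sketch of AVG: pmulhrsw with pw_1024 computes
; (x*1024*2 + 0x8000) >> 16 = (x + 16) >> 5, so per pixel
;   dst[x] = clip_pixel((tmp1[x] + tmp2[x] + 16) >> 5)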
8101
8102%macro AVG_INC_PTR 1
8103    add               tmp1q, %1*mmsize
8104    add               tmp2q, %1*mmsize
8105%endmacro
8106
8107cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
8108    LEA                  r6, avg_ssse3_table
    tzcnt                wd, wm ; trailing zeros, i.e. log2 of the block width
    movifnidn            hd, hm ; load h from the stack if it isn't already in a register
    movsxd               wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this width
    mova                 m2, [pw_1024+r6-avg_ssse3_table] ; pw_1024: pmulhrsw rounding constant for (x+16)>>5
8113    add                  wq, r6
8114    BIDIR_FN            AVG
8115
8116%macro W_AVG 1 ; src_offset
8117    ; (a * weight + b * (16 - weight) + 128) >> 8
8118    ; = ((a - b) * weight + (b << 4) + 128) >> 8
8119    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
8120    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
8121    mova                 m2, [tmp1q+(%1+0)*mmsize]
8122    mova                 m0, m2
8123    psubw                m2, [tmp2q+(%1+0)*mmsize]
8124    mova                 m3, [tmp1q+(%1+1)*mmsize]
8125    mova                 m1, m3
8126    psubw                m3, [tmp2q+(%1+1)*mmsize]
8127    pmulhw               m2, m4
8128    pmulhw               m3, m4
8129    paddw                m0, m2
8130    paddw                m1, m3
8131    pmulhrsw             m0, m5
8132    pmulhrsw             m1, m5
8133    packuswb             m0, m1
8134%endmacro
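
; A scalar sketch of W_AVG: m4 = (weight-16) << 12, so pmulhw yields
; ((a - b) * (weight - 16)) >> 4, and pmulhrsw with pw_2048 is
; (x + 8) >> 4, giving
;   dst[x] = clip_pixel(((((a - b) * (weight - 16)) >> 4) + a + 8) >> 4)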
8135
8136%define W_AVG_INC_PTR AVG_INC_PTR
8137
8138cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
8139    LEA                  r6, w_avg_ssse3_table
8140    tzcnt                wd, wm
    movd                 m4, r6m ; weight
8142    movifnidn            hd, hm
8143    pxor                 m0, m0
8144    movsxd               wq, dword [r6+wq*4]
8145    mova                 m5, [pw_2048+r6-w_avg_ssse3_table]
8146    pshufb               m4, m0
8147    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
8148    add                  wq, r6
8149    cmp           dword r6m, 7
8150    jg .weight_gt7
8151    mov                  r6, tmp1q
8152    psubw                m0, m4
8153    mov               tmp1q, tmp2q
    mova                 m4, m0 ; -weight << 12
8155    mov               tmp2q, r6
8156.weight_gt7:
8157    BIDIR_FN          W_AVG
8158
8159%macro MASK 1 ; src_offset
8160    ; (a * m + b * (64 - m) + 512) >> 10
8161    ; = ((a - b) * m + (b << 6) + 512) >> 10
8162    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
8163    mova                 m3,     [maskq+(%1+0)*(mmsize/2)]
8164    mova                 m0,     [tmp2q+(%1+0)*mmsize] ; b
8165    psubw                m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
8166    mova                 m6, m3      ; m
8167    psubb                m3, m4, m6  ; -m
8168    paddw                m1, m1     ; (b - a) << 1
8169    paddb                m3, m3     ; -m << 1
8170    punpcklbw            m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
8171    pmulhw               m1, m2     ; (-m * (b - a)) << 10
8172    paddw                m0, m1     ; + b
8173    mova                 m1,     [tmp2q+(%1+1)*mmsize] ; b
8174    psubw                m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
8175    paddw                m2, m2  ; (b - a) << 1
    mova                 m6, m3  ; (-m << 1)
    punpckhbw            m3, m4, m6 ; (-m << 9)
    pmulhw               m2, m3 ; (-m * (b - a)) << 10
    paddw                m1, m2 ; + b
    pmulhrsw             m0, m5 ; round
    pmulhrsw             m1, m5 ; round
    packuswb             m0, m1 ; pack words to bytes with unsigned saturation
8183%endmacro
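
; A scalar sketch of MASK: pmulhw of (b - a) << 1 by -m << 9 yields
; ((b - a) * -m) >> 6, so per pixel
;   dst[x] = clip_pixel(((((b - a) * -m[x]) >> 6) + b + 8) >> 4)
; which equals (a*m + b*(64 - m) + 512) >> 10.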
8184
8185%macro MASK_INC_PTR 1
8186    add               maskq, %1*mmsize/2
8187    add               tmp1q, %1*mmsize
8188    add               tmp2q, %1*mmsize
8189%endmacro
8190
8191%if ARCH_X86_64
8192cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
8193    movifnidn            hd, hm
8194%else
8195cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
8196%define hd dword r5m
8197%endif
8198%define base r6-mask_ssse3_table
8199    LEA                  r6, mask_ssse3_table
8200    tzcnt                wd, wm
8201    movsxd               wq, dword [r6+wq*4]
8202    pxor                 m4, m4
8203    mova                 m5, [base+pw_2048]
8204    add                  wq, r6
8205    mov               maskq, r6m
8206    BIDIR_FN           MASK
8207%undef hd
8208
8209%macro W_MASK_420_END 1-*
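    ; accumulates the 2x2 (64 - m) pair sums into the final 420 mask
    ; bytes; the wide paths stash intermediate sums in the odd 16-byte
    ; slots of dst row 1 and read them back here before overwriting
    ; those slots with the row's pixels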
8210%rep %0
8211    call .main
8212    paddw                m2, [maskq+16*%1]
8213    mova      [maskq+16*%1], m2
8214    mova [dstq+strideq*1+16*(2*%1+0)], m0
8215    call .main
8216    psubw                m3, m7, m2
8217    psubw                m1, m7, [maskq+16*%1]
8218    psubw                m3, [dstq+strideq*1+16*(2*%1+1)]
8219    psrlw                m1, 2
8220    psrlw                m3, 2
8221    packuswb             m1, m3
8222    mova      [maskq+16*%1], m1
8223    mova [dstq+strideq*1+16*(2*%1+1)], m0
8224    %rotate 1
8225%endrep
8226%endmacro
8227
8228%if UNIX64
8229DECLARE_REG_TMP 7
8230%else
8231DECLARE_REG_TMP 5
8232%endif
8233
8234cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
8235%define base t0-w_mask_420_ssse3_table
8236    LEA                  t0, w_mask_420_ssse3_table
8237    tzcnt                wd, wm
8238    mov                 r6d, r7m ; sign
8239    sub               tmp2q, tmp1q
8240    movsxd               wq, [t0+wq*4]
8241    mova                 m6, [base+pw_2048]
8242    movddup              m7, [base+wm_420_sign+r6*8] ; 258 - sign
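    ; scalar form of the 2x2 420 mask: each m2 from .main holds
    ; 128 - (m0 + m1) per column pair, so (258 - sign) minus two such
    ; sums is m0 + m1 + m2 + m3 + 2 - sign, and the final >> 2 gives
    ; (m0 + m1 + m2 + m3 + 2 - sign) >> 2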
8243    add                  wq, t0
8244%if ARCH_X86_64
8245    mova                 m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
8246    movifnidn            hd, hm
8247%else
8248    %define              m8  [base+pw_6903]
8249    %define              hd  dword hm
8250%endif
8251    mov               maskq, maskmp
8252    call .main
8253    jmp                  wq
8254.w4_loop:
8255    call .main
8256    add               maskq, 4
8257    lea                dstq, [dstq+strideq*2]
8258.w4:
8259    pshufd               m3, m2, q2020
8260    pshufd               m2, m2, q3131
8261    psubw                m1, m7, m3
8262    psubw                m1, m2
8263    psrlw                m1, 2
8264    packuswb             m1, m1
8265    movd            [maskq], m1
8266    movd   [dstq+strideq*0], m0
8267    pshuflw              m1, m0, q1032
8268    movd   [dstq+strideq*1], m1
8269    punpckhqdq           m0, m0
8270    lea                dstq, [dstq+strideq*2]
8271    movd   [dstq+strideq*0], m0
8272    pshuflw              m1, m0, q1032
8273    movd   [dstq+strideq*1], m1
8274    sub                  hd, 4
8275    jg .w4_loop
8276    RET
8277.w8_loop:
8278    call .main
8279    add               maskq, 4
8280    lea                dstq, [dstq+strideq*2]
8281.w8:
8282    movhlps              m3, m2
8283    psubw                m1, m7, m2
8284    psubw                m1, m3
8285    psrlw                m1, 2
8286    packuswb             m1, m1
8287    movd            [maskq], m1
8288    movq   [dstq+strideq*0], m0
8289    movhps [dstq+strideq*1], m0
8290    sub                  hd, 2
8291    jg .w8_loop
8292    RET
8293.w16_loop:
8294    call .main
8295    add               maskq, 8
8296    lea                dstq, [dstq+strideq*2]
8297.w16:
8298    mova   [dstq+strideq*1], m2
8299    mova   [dstq+strideq*0], m0
8300    call .main
8301    psubw                m1, m7, [dstq+strideq*1]
8302    psubw                m1, m2
8303    psrlw                m1, 2
8304    packuswb             m1, m1
8305    movq            [maskq], m1
8306    mova   [dstq+strideq*1], m0
8307    sub                  hd, 2
8308    jg .w16_loop
8309    RET
8310.w32_loop:
8311    call .main
8312    add               maskq, 16
8313    lea                dstq, [dstq+strideq*2]
8314.w32:
8315    mova            [maskq], m2
8316    mova [dstq+strideq*0+16*0], m0
8317    call .main
8318    mova [dstq+strideq*1+16*1], m2
8319    mova [dstq+strideq*0+16*1], m0
8320    W_MASK_420_END        0
8321    sub                  hd, 2
8322    jg .w32_loop
8323    RET
8324.w64_loop:
8325    call .main
8326    add               maskq, 16*2
8327    lea                dstq, [dstq+strideq*2]
8328.w64:
8329    mova       [maskq+16*0], m2
8330    mova [dstq+strideq*0+16*0], m0
8331    call .main
8332    mova [dstq+strideq*1+16*1], m2
8333    mova [dstq+strideq*0+16*1], m0
8334    call .main
8335    mova       [maskq+16*1], m2
8336    mova [dstq+strideq*0+16*2], m0
8337    call .main
8338    mova [dstq+strideq*1+16*3], m2
8339    mova [dstq+strideq*0+16*3], m0
8340    W_MASK_420_END        0, 1
8341    sub                  hd, 2
8342    jg .w64_loop
8343    RET
8344.w128_loop:
8345    call .main
8346    add               maskq, 16*4
8347    lea                dstq, [dstq+strideq*2]
8348.w128:
8349    mova       [maskq+16*0], m2
8350    mova [dstq+strideq*0+16*0], m0
8351    call .main
8352    mova [dstq+strideq*1+16*1], m2
8353    mova [dstq+strideq*0+16*1], m0
8354    call .main
8355    mova       [maskq+16*1], m2
8356    mova [dstq+strideq*0+16*2], m0
8357    call .main
8358    mova [dstq+strideq*1+16*3], m2
8359    mova [dstq+strideq*0+16*3], m0
8360    call .main
8361    mova       [maskq+16*2], m2
8362    mova [dstq+strideq*0+16*4], m0
8363    call .main
8364    mova [dstq+strideq*1+16*5], m2
8365    mova [dstq+strideq*0+16*5], m0
8366    call .main
8367    mova       [maskq+16*3], m2
8368    mova [dstq+strideq*0+16*6], m0
8369    call .main
8370    mova [dstq+strideq*1+16*7], m2
8371    mova [dstq+strideq*0+16*7], m0
8372    W_MASK_420_END        0, 1, 2, 3
8373    sub                  hd, 2
8374    jg .w128_loop
8375    RET
8376ALIGN function_align
8377.main:
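    ; per pixel: 64 - m = satsub(6903, |t1 - t2|) >> 8, i.e.
    ; m = min(38 + ((|t1 - t2| + 8) >> 8), 64), then blend
    ; dst = clip_pixel((t1*m + t2*(64 - m) + 512) >> 10) via the same
    ; (b - a) * (64 - m) trick as the MASK macro above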
8378    mova                 m0, [tmp1q      +16*0]
8379    mova                 m3, [tmp1q+tmp2q+16*0]
8380    mova                 m1, [tmp1q      +16*1]
8381    mova                 m4, [tmp1q+tmp2q+16*1]
8382    add               tmp1q, 16*2
8383    psubw                m3, m0
8384    psubw                m4, m1
8385    pabsw                m5, m3
8386    psubusw              m2, m8, m5
8387    psrlw                m2, 8 ; 64 - m
8388    psllw                m5, m2, 10
8389    pmulhw               m3, m5
8390    pabsw                m5, m4
8391    paddw                m0, m3
8392    psubusw              m3, m8, m5
8393    psrlw                m3, 8
8394    phaddw               m2, m3
8395    psllw                m3, 10
8396    pmulhw               m4, m3
8397    paddw                m1, m4
8398    pmulhrsw             m0, m6
8399    pmulhrsw             m1, m6
8400    packuswb             m0, m1
8401    ret
8402
8403%macro W_MASK_422_BACKUP 1 ; mask_offset
8404%if ARCH_X86_64
8405    mova                m10, m2
8406%else
8407    mova      [maskq+16*%1], m2
8408%endif
8409%endmacro
8410
8411%macro W_MASK_422_END 1 ; mask_offset
8412%if ARCH_X86_64
8413    packuswb            m10, m2
8414    psubb                m1, m7, m10
8415    pavgb                m1, m9
8416%else
8417    mova                 m3, [maskq+16*%1]
8418    packuswb             m3, m2
8419    pxor                 m2, m2
8420    psubb                m1, m7, m3
8421    pavgb                m1, m2
8422%endif
8423    mova      [maskq+16*%1], m1
8424%endmacro
8425
8426cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
8427%define base t0-w_mask_422_ssse3_table
8428    LEA                  t0, w_mask_422_ssse3_table
8429    tzcnt                wd, wm
8430    mov                 r6d, r7m ; sign
8431    sub               tmp2q, tmp1q
8432    movsxd               wq, [t0+wq*4]
8433    mova                 m6, [base+pw_2048]
8434    movddup              m7, [base+wm_422_sign+r6*8] ; 128 - sign
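    ; scalar form of the 422 mask: packuswb pairs two 128 - (m0 + m1)
    ; sums, psubb from 128 - sign gives m0 + m1 - sign, and pavgb
    ; against zero is (x + 1) >> 1, i.e. (m0 + m1 + 1 - sign) >> 1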
8435    add                  wq, t0
8436%if ARCH_X86_64
8437    mova                 m8, [base+pw_6903]
8438    pxor                 m9, m9
8439    movifnidn            hd, hm
8440%else
8441    add                  t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
8442    %define              hd  dword hm
8443%endif
8444    mov               maskq, maskmp
8445    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8446    jmp                  wq
8447.w4_loop:
8448    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8449    add               maskq, 8
8450    lea                dstq, [dstq+strideq*2]
8451.w4:
8452    packuswb             m2, m2
8453    psubb                m1, m7, m2
8454%if ARCH_X86_64
8455    pavgb                m1, m9
8456%else
8457    pxor                 m2, m2
8458    pavgb                m1, m2
8459%endif
8460    movq            [maskq], m1
8461    movd   [dstq+strideq*0], m0
8462    pshuflw              m1, m0, q1032
8463    movd   [dstq+strideq*1], m1
8464    punpckhqdq           m0, m0
8465    lea                dstq, [dstq+strideq*2]
8466    movd   [dstq+strideq*0], m0
8467    pshuflw              m1, m0, q1032
8468    movd   [dstq+strideq*1], m1
8469    sub                  hd, 4
8470    jg .w4_loop
8471    RET
8472.w8_loop:
8473    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8474    add               maskq, 16
8475    lea                dstq, [dstq+strideq*2]
8476.w8:
8477    W_MASK_422_BACKUP     0
8478    movq   [dstq+strideq*0], m0
8479    movhps [dstq+strideq*1], m0
8480    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8481    lea                dstq, [dstq+strideq*2]
8482    W_MASK_422_END        0
8483    movq   [dstq+strideq*0], m0
8484    movhps [dstq+strideq*1], m0
8485    sub                  hd, 4
8486    jg .w8_loop
8487    RET
8488.w16_loop:
8489    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8490    add               maskq, 16
8491    lea                dstq, [dstq+strideq*2]
8492.w16:
8493    W_MASK_422_BACKUP     0
8494    mova   [dstq+strideq*0], m0
8495    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8496    W_MASK_422_END        0
8497    mova   [dstq+strideq*1], m0
8498    sub                  hd, 2
8499    jg .w16_loop
8500    RET
8501.w32_loop:
8502    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8503    add               maskq, 16
8504    add                dstq, strideq
8505.w32:
8506    W_MASK_422_BACKUP     0
8507    mova        [dstq+16*0], m0
8508    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8509    W_MASK_422_END        0
8510    mova        [dstq+16*1], m0
8511    dec                  hd
8512    jg .w32_loop
8513    RET
8514.w64_loop:
8515    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8516    add               maskq, 16*2
8517    add                dstq, strideq
8518.w64:
8519    W_MASK_422_BACKUP     0
8520    mova        [dstq+16*0], m0
8521    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8522    W_MASK_422_END        0
8523    mova        [dstq+16*1], m0
8524    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8525    W_MASK_422_BACKUP     1
8526    mova        [dstq+16*2], m0
8527    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8528    W_MASK_422_END        1
8529    mova        [dstq+16*3], m0
8530    dec                  hd
8531    jg .w64_loop
8532    RET
8533.w128_loop:
8534    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8535    add               maskq, 16*4
8536    add                dstq, strideq
8537.w128:
8538    W_MASK_422_BACKUP     0
8539    mova        [dstq+16*0], m0
8540    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8541    W_MASK_422_END        0
8542    mova        [dstq+16*1], m0
8543    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8544    W_MASK_422_BACKUP     1
8545    mova        [dstq+16*2], m0
8546    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8547    W_MASK_422_END        1
8548    mova        [dstq+16*3], m0
8549    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8550    W_MASK_422_BACKUP     2
8551    mova        [dstq+16*4], m0
8552    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8553    W_MASK_422_END        2
8554    mova        [dstq+16*5], m0
8555    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8556    W_MASK_422_BACKUP     3
8557    mova        [dstq+16*6], m0
8558    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
8559    W_MASK_422_END        3
8560    mova        [dstq+16*7], m0
8561    dec                  hd
8562    jg .w128_loop
8563    RET
8564
8565cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
8566%define base t0-w_mask_444_ssse3_table
8567    LEA                  t0, w_mask_444_ssse3_table
8568    tzcnt                wd, wm
8569    mov               maskq, maskmp
8570    sub               tmp2q, tmp1q
8571    movsxd               wq, [t0+wq*4]
8572    mova                 m6, [base+pw_6903]
8573    mova                 m7, [base+pw_2048]
8574    add                  wq, t0
8575%if ARCH_X86_64
8576    mova                 m8, [base+pb_64]
8577    movifnidn            hd, hm
8578%else
8579    %define              m8  [base+pb_64]
8580    %define              hd  dword hm
8581%endif
8582    call .main
8583    jmp                  wq
8584.w4_loop:
8585    call .main
8586    lea                dstq, [dstq+strideq*2]
8587.w4:
8588    movd   [dstq+strideq*0], m0
8589    pshuflw              m1, m0, q1032
8590    movd   [dstq+strideq*1], m1
8591    punpckhqdq           m0, m0
8592    lea                dstq, [dstq+strideq*2]
8593    movd   [dstq+strideq*0], m0
8594    pshuflw              m1, m0, q1032
8595    movd   [dstq+strideq*1], m1
8596    sub                  hd, 4
8597    jg .w4_loop
8598    RET
8599.w8_loop:
8600    call .main
8601    lea                dstq, [dstq+strideq*2]
8602.w8:
8603    movq   [dstq+strideq*0], m0
8604    movhps [dstq+strideq*1], m0
8605    sub                  hd, 2
8606    jg .w8_loop
8607    RET
8608.w16_loop:
8609    call .main
8610    lea                dstq, [dstq+strideq*2]
8611.w16:
8612    mova   [dstq+strideq*0], m0
8613    call .main
8614    mova   [dstq+strideq*1], m0
8615    sub                  hd, 2
8616    jg .w16_loop
8617    RET
8618.w32_loop:
8619    call .main
8620    add                dstq, strideq
8621.w32:
8622    mova        [dstq+16*0], m0
8623    call .main
8624    mova        [dstq+16*1], m0
8625    dec                  hd
8626    jg .w32_loop
8627    RET
8628.w64_loop:
8629    call .main
8630    add                dstq, strideq
8631.w64:
8632    mova        [dstq+16*0], m0
8633    call .main
8634    mova        [dstq+16*1], m0
8635    call .main
8636    mova        [dstq+16*2], m0
8637    call .main
8638    mova        [dstq+16*3], m0
8639    dec                  hd
8640    jg .w64_loop
8641    RET
8642.w128_loop:
8643    call .main
8644    add                dstq, strideq
8645.w128:
8646    mova        [dstq+16*0], m0
8647    call .main
8648    mova        [dstq+16*1], m0
8649    call .main
8650    mova        [dstq+16*2], m0
8651    call .main
8652    mova        [dstq+16*3], m0
8653    call .main
8654    mova        [dstq+16*4], m0
8655    call .main
8656    mova        [dstq+16*5], m0
8657    call .main
8658    mova        [dstq+16*6], m0
8659    call .main
8660    mova        [dstq+16*7], m0
8661    dec                  hd
8662    jg .w128_loop
8663    RET
8664ALIGN function_align
8665.main:
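    ; same computation as the 420/422 .main, but the full-resolution
    ; mask m = 64 - (64 - m) is written out directly per pixel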
8666    mova                 m0, [tmp1q      +16*0]
8667    mova                 m3, [tmp1q+tmp2q+16*0]
8668    mova                 m1, [tmp1q      +16*1]
8669    mova                 m4, [tmp1q+tmp2q+16*1]
8670    add               tmp1q, 16*2
8671    psubw                m3, m0
8672    psubw                m4, m1
8673    pabsw                m5, m3
8674    psubusw              m2, m6, m5
8675    psrlw                m2, 8 ; 64 - m
8676    psllw                m5, m2, 10
8677    pmulhw               m3, m5
8678    pabsw                m5, m4
8679    paddw                m0, m3
8680    psubusw              m3, m6, m5
8681    psrlw                m3, 8
8682    packuswb             m2, m3
8683    psllw                m3, 10
8684    pmulhw               m4, m3
8685    psubb                m3, m8, m2
8686    paddw                m1, m4
8687    pmulhrsw             m0, m7
8688    pmulhrsw             m1, m7
8689    mova            [maskq], m3
8690    add               maskq, 16
8691    packuswb             m0, m1
8692    ret
8693
8694%macro BLEND_64M 4; a, b, mask1, mask2
    punpcklbw            m0, %1, %2; {b;a}[7..0]
    punpckhbw            %1, %2    ; {b;a}[15..8]
    pmaddubsw            m0, %3    ; {b*m[0] + (64-m[0])*a}[7..0] u16
    pmaddubsw            %1, %4    ; {b*m[1] + (64-m[1])*a}[15..8] u16
    pmulhrsw             m0, m5    ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
    pmulhrsw             %1, m5    ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16
    packuswb             m0, %1    ; {blendpx}[15..0] u8
8702%endmacro
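
; pmulhrsw with pw_512 implements the (x + 32) >> 6 rounding above:
; (x*512*2 + 0x8000) >> 16 = (x + 32) >> 6.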
8703
8704%macro BLEND 2; a, b
8705    psubb                m3, m4, m0 ; m3 = (64 - m)
8706    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
8707    punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
8708    BLEND_64M            %1, %2, m2, m3
8709%endmacro
8710
8711cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
8712%define base r6-blend_ssse3_table
8713    LEA                  r6, blend_ssse3_table
8714    tzcnt                wd, wm
8715    movifnidn            hd, hm
8716    movifnidn         maskq, maskmp
8717    movsxd               wq, dword [r6+wq*4]
8718    mova                 m4, [base+pb_64]
8719    mova                 m5, [base+pw_512]
8720    add                  wq, r6
8721    lea                  r6, [dsq*3]
8722    jmp                  wq
8723.w4:
8724    movq                 m0, [maskq]; m
8725    movd                 m1, [dstq+dsq*0] ; a
8726    movd                 m6, [dstq+dsq*1]
8727    punpckldq            m1, m6
8728    movq                 m6, [tmpq] ; b
8729    psubb                m3, m4, m0 ; m3 = (64 - m)
8730    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
8731    punpcklbw            m1, m6    ; {b;a}[7..0]
    pmaddubsw            m1, m2    ; {b*m[0] + (64-m[0])*a}[7..0] u16
    pmulhrsw             m1, m5    ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
    packuswb             m1, m0    ; {blendpx}[7..0] u8
8735    movd       [dstq+dsq*0], m1
8736    psrlq                m1, 32
8737    movd       [dstq+dsq*1], m1
8738    add               maskq, 8
8739    add                tmpq, 8
8740    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
8741    sub                  hd, 2
8742    jg .w4
8743    RET
8744.w8:
8745    mova                 m0, [maskq]; m
8746    movq                 m1, [dstq+dsq*0] ; a
8747    movhps               m1, [dstq+dsq*1]
8748    mova                 m6, [tmpq] ; b
8749    BLEND                m1, m6
8750    movq       [dstq+dsq*0], m0
8751    movhps     [dstq+dsq*1], m0
8752    add               maskq, 16
8753    add                tmpq, 16
8754    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
8755    sub                  hd, 2
8756    jg .w8
8757    RET
8758.w16:
8759    mova                 m0, [maskq]; m
8760    mova                 m1, [dstq] ; a
8761    mova                 m6, [tmpq] ; b
8762    BLEND                m1, m6
8763    mova             [dstq], m0
8764    add               maskq, 16
8765    add                tmpq, 16
8766    add                dstq, dsq ; dst_stride
8767    dec                  hd
8768    jg .w16
8769    RET
8770.w32:
8771    %assign i 0
8772    %rep 2
8773    mova                 m0, [maskq+16*i]; m
8774    mova                 m1, [dstq+16*i] ; a
8775    mova                 m6, [tmpq+16*i] ; b
8776    BLEND                m1, m6
8777    mova        [dstq+i*16], m0
8778    %assign i i+1
8779    %endrep
8780    add               maskq, 32
8781    add                tmpq, 32
8782    add                dstq, dsq ; dst_stride
8783    dec                  hd
8784    jg .w32
8785    RET
8786
8787cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
8788%define base r5-blend_v_ssse3_table
8789    LEA                  r5, blend_v_ssse3_table
8790    tzcnt                wd, wm
8791    movifnidn            hd, hm
8792    movsxd               wq, dword [r5+wq*4]
8793    mova                 m5, [base+pw_512]
8794    add                  wq, r5
8795    add               maskq, obmc_masks-blend_v_ssse3_table
8796    jmp                  wq
8797.w2:
8798    movd                 m3, [maskq+4]
8799    punpckldq            m3, m3
    ; the 2-wide mask blends 2 pixels on each of 2 rows
8801.w2_loop:
8802    movd                 m1, [dstq+dsq*0] ; a {..;a;a}
8803    pinsrw               m1, [dstq+dsq*1], 1
8804    movd                 m2, [tmpq] ; b
    punpcklbw            m0, m1, m2; {b;a}[3..0]
    pmaddubsw            m0, m3    ; {b*m + (64-m)*a}[3..0] u16
    pmulhrsw             m0, m5    ; {(b*m + (64-m)*a + 32) >> 6}[3..0] u16
    packuswb             m0, m1    ; {blendpx}[3..0] u8
8809    movd                r3d, m0
8810    mov        [dstq+dsq*0], r3w
8811    shr                 r3d, 16
8812    mov        [dstq+dsq*1], r3w
8813    add                tmpq, 2*2
8814    lea                dstq, [dstq + dsq * 2]
8815    sub                  hd, 2
8816    jg .w2_loop
8817    RET
8818.w4:
8819    movddup              m3, [maskq+8]
    ; the 4-wide mask blends 4 pixels on each of 2 rows
8821.w4_loop:
8822    movd                 m1, [dstq+dsq*0] ; a
8823    movd                 m2, [dstq+dsq*1] ;
8824    punpckldq            m1, m2
8825    movq                 m2, [tmpq] ; b
8826    punpcklbw            m1, m2    ; {b;a}[7..0]
    pmaddubsw            m1, m3    ; {b*m + (64-m)*a}[7..0] u16
    pmulhrsw             m1, m5    ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
    packuswb             m1, m1    ; {blendpx}[7..0] u8
8830    movd             [dstq], m1
8831    psrlq                m1, 32
8832    movd       [dstq+dsq*1], m1
8833    add                tmpq, 2*4
8834    lea                dstq, [dstq+dsq*2]
8835    sub                  hd, 2
8836    jg .w4_loop
8837    RET
8838.w8:
8839    mova                 m3, [maskq+16]
    ; the 8-wide mask blends 8 pixels on each of 2 rows
8841.w8_loop:
8842    movq                 m1, [dstq+dsq*0] ; a
8843    movhps               m1, [dstq+dsq*1]
8844    mova                 m2, [tmpq]; b
8845    BLEND_64M            m1, m2, m3, m3
8846    movq       [dstq+dsq*0], m0
8847    movhps     [dstq+dsq*1], m0
8848    add                tmpq, 16
8849    lea                dstq, [dstq+dsq*2]
8850    sub                  hd, 2
8851    jg .w8_loop
8852    RET
8853.w16:
    ; the 16-wide mask blends 16 pixels per row
8855    mova                  m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
8856    mova                  m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
8857.w16_loop:
8858    mova                 m1, [dstq] ; a
8859    mova                 m2, [tmpq] ; b
8860    BLEND_64M            m1, m2, m3, m4
8861    mova             [dstq], m0
8862    add                tmpq, 16
8863    add                dstq, dsq
8864    dec                  hd
8865    jg .w16_loop
8866    RET
8867.w32:
8868%if WIN64
8869    mova            [rsp+8], xmm6
8870%endif
8871    mova                 m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
8872    mova                 m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
8873    mova                 m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
    ; only 24 mask entries are stored for w32 (the rest are 64, i.e.
    ; plain copy), so blend 16+8 pixels per row
8875.w32_loop:
8876    mova                 m1, [dstq+16*0] ; a
8877    mova                 m2, [tmpq+16*0] ; b
8878    BLEND_64M            m1, m2, m3, m4
8879    movq                 m1, [dstq+16*1] ; a
8880    punpcklbw            m1, [tmpq+16*1] ; b
8881    pmaddubsw            m1, m6
8882    pmulhrsw             m1, m5
8883    packuswb             m1, m1
8884    mova        [dstq+16*0], m0
8885    movq        [dstq+16*1], m1
8886    add                tmpq, 32
8887    add                dstq, dsq
8888    dec                  hd
8889    jg .w32_loop
8890%if WIN64
8891    mova               xmm6, [rsp+8]
8892%endif
8893    RET
8894
8895cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
8896%define base t0-blend_h_ssse3_table
8897%if ARCH_X86_32
8898    ; We need to keep the PIC pointer for w4, reload wd from stack instead
8899    DECLARE_REG_TMP 6
8900%else
8901    DECLARE_REG_TMP 5
8902    mov                 r6d, wd
8903%endif
8904    LEA                  t0, blend_h_ssse3_table
8905    tzcnt                wd, wm
8906    mov                  hd, hm
8907    movsxd               wq, dword [t0+wq*4]
8908    mova                 m5, [base+pw_512]
8909    add                  wq, t0
8910    lea               maskq, [base+obmc_masks+hq*2]
8911    lea                  hd, [hq*3]
8912    shr                  hd, 2 ; h * 3/4
8913    lea               maskq, [maskq+hq*2]
8914    neg                  hq
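    ; mask pointer math: the interleaved (m, 64-m) group for height h
    ; starts at obmc_masks + h*2, and only the first h*3/4 rows need
    ; blending, so hq counts up from -(h*3/4) to 0 and each row reads
    ; its pair from [maskq+hq*2]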
8915    jmp                  wq
8916.w2:
8917    movd                 m0, [dstq+dsq*0]
8918    pinsrw               m0, [dstq+dsq*1], 1
8919    movd                 m2, [maskq+hq*2]
8920    movd                 m1, [tmpq]
8921    punpcklwd            m2, m2
8922    punpcklbw            m0, m1
8923    pmaddubsw            m0, m2
8924    pmulhrsw             m0, m5
8925    packuswb             m0, m0
8926    movd                r3d, m0
8927    mov        [dstq+dsq*0], r3w
8928    shr                 r3d, 16
8929    mov        [dstq+dsq*1], r3w
8930    lea                dstq, [dstq+dsq*2]
8931    add                tmpq, 2*2
8932    add                  hq, 2
8933    jl .w2
8934    RET
8935.w4:
8936%if ARCH_X86_32
8937    mova                 m3, [base+blend_shuf]
8938%else
8939    mova                 m3, [blend_shuf]
8940%endif
8941.w4_loop:
8942    movd                 m0, [dstq+dsq*0]
8943    movd                 m2, [dstq+dsq*1]
8944    punpckldq            m0, m2 ; a
8945    movq                 m1, [tmpq] ; b
8946    movq                 m2, [maskq+hq*2] ; m
8947    pshufb               m2, m3
8948    punpcklbw            m0, m1
8949    pmaddubsw            m0, m2
8950    pmulhrsw             m0, m5
8951    packuswb             m0, m0
8952    movd       [dstq+dsq*0], m0
8953    psrlq                m0, 32
8954    movd       [dstq+dsq*1], m0
8955    lea                dstq, [dstq+dsq*2]
8956    add                tmpq, 4*2
8957    add                  hq, 2
8958    jl .w4_loop
8959    RET
8960.w8:
8961    movd                 m4, [maskq+hq*2]
8962    punpcklwd            m4, m4
8963    pshufd               m3, m4, q0000
8964    pshufd               m4, m4, q1111
8965    movq                 m1, [dstq+dsq*0] ; a
8966    movhps               m1, [dstq+dsq*1]
8967    mova                 m2, [tmpq]
8968    BLEND_64M            m1, m2, m3, m4
8969    movq       [dstq+dsq*0], m0
8970    movhps     [dstq+dsq*1], m0
8971    lea                dstq, [dstq+dsq*2]
8972    add                tmpq, 8*2
8973    add                  hq, 2
8974    jl .w8
8975    RET
8976; w16/w32/w64/w128
8977.w16:
8978%if ARCH_X86_32
8979    mov                 r6d, wm
8980%endif
8981    sub                 dsq, r6
8982.w16_loop0:
8983    movd                 m3, [maskq+hq*2]
8984    pshuflw              m3, m3, q0000
8985    punpcklqdq           m3, m3
8986    mov                  wd, r6d
8987.w16_loop:
8988    mova                 m1, [dstq] ; a
8989    mova                 m2, [tmpq] ; b
8990    BLEND_64M            m1, m2, m3, m3
8991    mova             [dstq], m0
8992    add                dstq, 16
8993    add                tmpq, 16
8994    sub                  wd, 16
8995    jg .w16_loop
8996    add                dstq, dsq
8997    inc                  hq
8998    jl .w16_loop0
8999    RET
9000
9001; emu_edge args:
9002; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
9003; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
9004; const pixel *ref, const ptrdiff_t ref_stride
9005;
; bw, bh: total size of the filled block
; iw, ih: size of the copied source block -> determines bottom/right fill
; x, y:   offset within bw/bh            -> determines top/left fill
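;
; A scalar sketch of the whole operation, matching the iclip comments
; below:
;   ref += iclip(y, 0, ih - 1) * ref_stride + iclip(x, 0, iw - 1)
;   copy the center rows, replicating the first/last source byte into
;   the left/right extensions, then replicate the first and last
;   filled rows upward/downward into the top/bottom extensions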
9009cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
9010                                  y, dst, dstride, src, sstride, \
9011                                  bottomext, rightext, blk
9012    ; we assume that the buffer (stride) is larger than width, so we can
9013    ; safely overwrite by a few bytes
9014    pxor                 m1, m1
9015
9016%if ARCH_X86_64
9017 %define reg_zero       r12q
9018 %define reg_tmp        r10
9019 %define reg_src        srcq
9020 %define reg_bottomext  bottomextq
9021 %define reg_rightext   rightextq
9022 %define reg_blkm       r9m
9023%else
9024 %define reg_zero       r6
9025 %define reg_tmp        r0
9026 %define reg_src        r1
9027 %define reg_bottomext  r0
9028 %define reg_rightext   r1
9029 %define reg_blkm       r2m
9030%endif
9031    ;
9032    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
9033    xor            reg_zero, reg_zero
9034    lea             reg_tmp, [ihq-1]
9035    cmp                  yq, ihq
9036    cmovs           reg_tmp, yq
9037    test                 yq, yq
9038    cmovs           reg_tmp, reg_zero
9039%if ARCH_X86_64
9040    imul            reg_tmp, sstrideq
9041    add                srcq, reg_tmp
9042%else
9043    imul            reg_tmp, sstridem
9044    mov             reg_src, srcm
9045    add             reg_src, reg_tmp
9046%endif
9047    ;
9048    ; ref += iclip(x, 0, iw - 1)
9049    lea             reg_tmp, [iwq-1]
9050    cmp                  xq, iwq
9051    cmovs           reg_tmp, xq
9052    test                 xq, xq
9053    cmovs           reg_tmp, reg_zero
9054    add             reg_src, reg_tmp
9055%if ARCH_X86_32
9056    mov                srcm, reg_src
9057%endif
9058    ;
9059    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
9060%if ARCH_X86_32
9061    mov                  r1, r1m ; restore bh
9062%endif
9063    lea       reg_bottomext, [yq+bhq]
9064    sub       reg_bottomext, ihq
9065    lea                  r3, [bhq-1]
9066    cmovs     reg_bottomext, reg_zero
9067    ;
9068
9069    DEFINE_ARGS bw, bh, iw, ih, x, \
9070                topext, dst, dstride, src, sstride, \
9071                bottomext, rightext, blk
9072
9073    ; top_ext = iclip(-y, 0, bh - 1)
9074    neg             topextq
9075    cmovs           topextq, reg_zero
9076    cmp       reg_bottomext, bhq
9077    cmovns    reg_bottomext, r3
9078    cmp             topextq, bhq
9079    cmovg           topextq, r3
9080 %if ARCH_X86_32
9081    mov                 r4m, reg_bottomext
9082    ;
9083    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
9084    mov                  r0, r0m ; restore bw
9085 %endif
9086    lea        reg_rightext, [xq+bwq]
9087    sub        reg_rightext, iwq
9088    lea                  r2, [bwq-1]
9089    cmovs      reg_rightext, reg_zero
9090
9091    DEFINE_ARGS bw, bh, iw, ih, leftext, \
9092                topext, dst, dstride, src, sstride, \
9093                bottomext, rightext, blk
9094
9095    ; left_ext = iclip(-x, 0, bw - 1)
9096    neg            leftextq
9097    cmovs          leftextq, reg_zero
9098    cmp        reg_rightext, bwq
9099    cmovns     reg_rightext, r2
9100 %if ARCH_X86_32
9101    mov                 r3m, r1
9102 %endif
9103    cmp            leftextq, bwq
9104    cmovns         leftextq, r2
9105
9106%undef reg_zero
9107%undef reg_tmp
9108%undef reg_src
9109%undef reg_bottomext
9110%undef reg_rightext
9111
9112    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
9113                topext, dst, dstride, src, sstride, \
9114                bottomext, rightext, blk
9115
9116    ; center_h = bh - top_ext - bottom_ext
9117%if ARCH_X86_64
9118    lea                  r3, [bottomextq+topextq]
9119    sub            centerhq, r3
9120%else
    mov                  r1, centerhm ; restore r1
    sub            centerhq, topextq
    sub            centerhq, r4m
    mov                 r1m, centerhq
9125%endif
9126    ;
9127    ; blk += top_ext * PXSTRIDE(dst_stride)
9128    mov                  r2, topextq
9129%if ARCH_X86_64
9130    imul                 r2, dstrideq
9131%else
9132    mov                  r6, r6m ; restore dstq
9133    imul                 r2, dstridem
9134%endif
9135    add                dstq, r2
9136    mov            reg_blkm, dstq ; save pointer for ext
9137    ;
9138    ; center_w = bw - left_ext - right_ext
9139    mov            centerwq, bwq
9140%if ARCH_X86_64
9141    lea                  r3, [rightextq+leftextq]
9142    sub            centerwq, r3
9143%else
9144    sub            centerwq, r3m
9145    sub            centerwq, leftextq
9146%endif
9147
9148; vloop Macro
9149%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
9150  %if ARCH_X86_64
9151    %define reg_tmp        r12
9152  %else
9153    %define reg_tmp        r0
9154  %endif
9155.v_loop_%3:
9156  %if ARCH_X86_32
9157    mov                  r0, r0m
9158    mov                  r1, r1m
9159  %endif
9160%if %1
9161    ; left extension
9162  %if ARCH_X86_64
9163    movd                 m0, [srcq]
9164  %else
9165    mov                  r3, srcm
9166    movd                 m0, [r3]
9167  %endif
9168    pshufb               m0, m1
9169    xor                  r3, r3
9170.left_loop_%3:
9171    mova          [dstq+r3], m0
9172    add                  r3, mmsize
9173    cmp                  r3, leftextq
9174    jl .left_loop_%3
9175    ; body
9176    lea             reg_tmp, [dstq+leftextq]
9177%endif
9178    xor                  r3, r3
9179.body_loop_%3:
9180  %if ARCH_X86_64
9181    movu                 m0, [srcq+r3]
9182  %else
9183    mov                  r1, srcm
9184    movu                 m0, [r1+r3]
9185  %endif
9186%if %1
9187    movu       [reg_tmp+r3], m0
9188%else
9189    movu          [dstq+r3], m0
9190%endif
9191    add                  r3, mmsize
9192    cmp                  r3, centerwq
9193    jl .body_loop_%3
9194%if %2
9195    ; right extension
9196%if %1
9197    add             reg_tmp, centerwq
9198%else
9199    lea             reg_tmp, [dstq+centerwq]
9200%endif
9201  %if ARCH_X86_64
9202    movd                 m0, [srcq+centerwq-1]
9203  %else
9204    mov                  r3, srcm
9205    movd                 m0, [r3+centerwq-1]
9206  %endif
9207    pshufb               m0, m1
9208    xor                  r3, r3
9209.right_loop_%3:
9210    movu       [reg_tmp+r3], m0
9211    add                  r3, mmsize
9212  %if ARCH_X86_64
9213    cmp                  r3, rightextq
9214  %else
9215    cmp                  r3, r3m
9216  %endif
9217    jl .right_loop_%3
9218%endif
9219  %if ARCH_X86_64
9220    add                dstq, dstrideq
9221    add                srcq, sstrideq
9222    dec            centerhq
9223    jg .v_loop_%3
9224  %else
9225    add                dstq, dstridem
9226    mov                  r0, sstridem
9227    add                srcm, r0
9228    sub       dword centerhm, 1
9229    jg .v_loop_%3
9230    mov                  r0, r0m ; restore r0
9231  %endif
%endmacro ; v_loop macro
9233
9234    test           leftextq, leftextq
9235    jnz .need_left_ext
9236 %if ARCH_X86_64
9237    test          rightextq, rightextq
9238    jnz .need_right_ext
9239 %else
    cmp            leftextq, r3m ; leftextq == 0 here, so this tests rightextq (r3m)
9241    jne .need_right_ext
9242 %endif
9243    v_loop                0, 0, 0
9244    jmp .body_done
9245
    ; left + right extensions
9247.need_left_ext:
9248 %if ARCH_X86_64
9249    test          rightextq, rightextq
9250 %else
9251    mov                  r3, r3m
9252    test                 r3, r3
9253 %endif
9254    jnz .need_left_right_ext
9255    v_loop                1, 0, 1
9256    jmp .body_done
9257
9258.need_left_right_ext:
9259    v_loop                1, 1, 2
9260    jmp .body_done
9261
9262.need_right_ext:
9263    v_loop                0, 1, 3
9264
9265.body_done:
; register map for the edge loops below:
; r0: bw
; r1: x loop counter
; r4: y loop counter
; r5: topextq
; r6: dstq
; r7: dstrideq
; r8: srcq
9273%if ARCH_X86_64
9274 %define reg_dstride    dstrideq
9275%else
9276 %define reg_dstride    r2
9277%endif
9278    ;
9279    ; bottom edge extension
9280 %if ARCH_X86_64
9281    test         bottomextq, bottomextq
9282    jz .top
9283 %else
9284    xor                  r1, r1
9285    cmp                  r1, r4m
9286    je .top
9287 %endif
9288    ;
9289 %if ARCH_X86_64
9290    mov                srcq, dstq
9291    sub                srcq, dstrideq
9292    xor                  r1, r1
9293 %else
9294    mov                  r3, dstq
9295    mov         reg_dstride, dstridem
9296    sub                  r3, reg_dstride
9297    mov                srcm, r3
9298 %endif
9299    ;
9300.bottom_x_loop:
9301 %if ARCH_X86_64
9302    mova                 m0, [srcq+r1]
9303    lea                  r3, [dstq+r1]
9304    mov                  r4, bottomextq
9305 %else
9306    mov                  r3, srcm
9307    mova                 m0, [r3+r1]
9308    lea                  r3, [dstq+r1]
9309    mov                  r4, r4m
9310 %endif
9311    ;
9312.bottom_y_loop:
9313    mova               [r3], m0
9314    add                  r3, reg_dstride
9315    dec                  r4
9316    jg .bottom_y_loop
9317    add                  r1, mmsize
9318    cmp                  r1, bwq
9319    jl .bottom_x_loop
9320
9321.top:
9322    ; top edge extension
9323    test            topextq, topextq
9324    jz .end
9325%if ARCH_X86_64
9326    mov                srcq, reg_blkm
9327%else
9328    mov                  r3, reg_blkm
9329    mov         reg_dstride, dstridem
9330%endif
9331    mov                dstq, dstm
9332    xor                  r1, r1
9333    ;
9334.top_x_loop:
9335%if ARCH_X86_64
9336    mova                 m0, [srcq+r1]
9337%else
9338    mov                  r3, reg_blkm
9339    mova                 m0, [r3+r1]
9340%endif
9341    lea                  r3, [dstq+r1]
9342    mov                  r4, topextq
9343    ;
9344.top_y_loop:
9345    mova               [r3], m0
9346    add                  r3, reg_dstride
9347    dec                  r4
9348    jg .top_y_loop
9349    add                  r1, mmsize
9350    cmp                  r1, bwq
9351    jl .top_x_loop
9352
9353.end:
9354    RET
9355
9356%undef reg_dstride
9357%undef reg_blkm
9358%undef reg_tmp
9359
9360cextern resize_filter
9361
9362%macro SCRATCH 3
9363%if ARCH_X86_32
9364    mova [rsp+%3*mmsize], m%1
9365%define m%2 [rsp+%3*mmsize]
9366%else
9367    SWAP             %1, %2
9368%endif
9369%endmacro
9370
%if ARCH_X86_64
cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
                                dst_w, h, src_w, dx, mx0
%elif STACK_ALIGNMENT >= 16
cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
                                      dst_w, h, src_w, dx, mx0
%else
cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
                                      dst_w, h, src_w, dx, mx0
%endif
    movifnidn          dstq, dstmp
    movifnidn          srcq, srcmp
%if STACK_ALIGNMENT >= 16
    movifnidn        dst_wd, dst_wm
%endif
%if ARCH_X86_64
    movifnidn            hd, hm
%endif
    sub          dword mx0m, 4<<14
    sub        dword src_wm, 8
    movd                 m7, dxm
    movd                 m6, mx0m
    movd                 m5, src_wm
    pshufd               m7, m7, q0000
    pshufd               m6, m6, q0000
    pshufd               m5, m5, q0000

%if ARCH_X86_64
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                  r7, $$
%define base r7-$$
%else
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
%define hd dword r5m
%if STACK_ALIGNMENT >= 16
    LEA                  r6, $$
%define base r6-$$
%else
    LEA                  r4, $$
%define base r4-$$
%endif
%endif

%if ARCH_X86_64
    mova                m10, [base+pw_m256]
    mova                 m9, [base+pd_63]
    mova                 m8, [base+pb_8x0_8x8]
%else
%define m10 [base+pw_m256]
%define m9  [base+pd_63]
%define m8  [base+pb_8x0_8x8]
%endif
    pmaddwd              m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
    pslld                m7, 2                      ; dx*4
    pslld                m5, 14                     ; (src_w-8)<<14
    paddd                m6, m4                     ; mx+[0..3]*dx
    SCRATCH               7, 13, 0
    SCRATCH               6, 12, 1
    SCRATCH               5, 11, 2

    ; m10 = pmulhrsw constant for x=(x+64)>>7
    ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = (src_w-8)<<14, m9 = 0x3f,
    ; m8 = {8x 0, 8x 8}
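    ; note: pmulhrsw(a, -256) = (-256*a + 0x4000) >> 15 = (64 - a) >> 7;
    ; the resize_filter coefficients are stored negated, so this multiply
    ; yields a rounding (x+64)>>7 of the positive filter sum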
.loop_y:
    xor                  xd, xd
    mova                 m0, m12                    ; per-line working version of mx

.loop_x:
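    ; clamp the four .14 fixed-point positions in m0 to [0, (src_w-8)<<14];
    ; SSE2 has no pminsd/pmaxsd, so the clamp is built from signed compares
    ; and masks. m1 = clamped position, m3 = m0-m1 (out-of-range distance,
    ; used below to pick an edge shuffle), m2 = filter table index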
    pxor                 m1, m1
    pcmpgtd              m1, m0
    pandn                m1, m0
    psrad                m2, m0, 8                  ; filter offset (unmasked)
    pcmpgtd              m3, m11, m1
    pand                 m1, m3
    pandn                m3, m11
    por                  m1, m3
    psubd                m3, m0, m1                 ; pshufb offset
    psrad                m1, 14                     ; clipped src_x offset
    psrad                m3, 14                     ; pshufb edge_emu offset
    pand                 m2, m9                     ; filter offset (masked)

    ; load source pixels
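    ; no gather instructions before AVX2, so extract the four dword offsets
    ; to GPRs and combine four 8-byte loads into m4/m5 via movq/movhps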
%if ARCH_X86_64
    movd                r8d, m1
    pshuflw              m1, m1, q3232
    movd                r9d, m1
    punpckhqdq           m1, m1
    movd               r10d, m1
    psrlq                m1, 32
    movd               r11d, m1
    movq                 m4, [srcq+r8]
    movq                 m5, [srcq+r10]
    movhps               m4, [srcq+r9]
    movhps               m5, [srcq+r11]
%else
    movd                r3d, m1
    pshufd               m1, m1, q3312
    movd                r1d, m1
    pshuflw              m1, m1, q3232
    movq                 m4, [srcq+r3]
    movq                 m5, [srcq+r1]
    movd                r3d, m1
    punpckhqdq           m1, m1
    movd                r1d, m1
    movhps               m4, [srcq+r3]
    movhps               m5, [srcq+r1]
%endif

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    pxor                 m6, m6
    pcmpeqb              m6, m3
%if ARCH_X86_64
    pmovmskb            r8d, m6
    cmp                 r8d, 0xffff
%else
    pmovmskb            r3d, m6
    cmp                 r3d, 0xffff
%endif
    je .filter
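    ; for out-of-range taps, fetch per-pixel shuffles from resize_shuf+4+dist
    ; that replicate the first/last source pixel into the out-of-bounds
    ; lanes; adding m8 redirects the high half of each shuffle to the
    ; movhps-loaded qword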
%if ARCH_X86_64
    movd                r8d, m3
    pshuflw              m3, m3, q3232
    movd                r9d, m3
    punpckhqdq           m3, m3
    movd               r10d, m3
    psrlq                m3, 32
    movd               r11d, m3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd              r10, r10d
    movsxd              r11, r11d
    movq                 m6, [base+resize_shuf+4+r8]
    movq                 m7, [base+resize_shuf+4+r10]
    movhps               m6, [base+resize_shuf+4+r9]
    movhps               m7, [base+resize_shuf+4+r11]
%else
    movd                r3d, m3
    pshufd               m3, m3, q3312
    movd                r1d, m3
    pshuflw              m3, m3, q3232
    movq                 m6, [base+resize_shuf+4+r3]
    movq                 m7, [base+resize_shuf+4+r1]
    movd                r3d, m3
    punpckhqdq           m3, m3
    movd                r1d, m3
    movhps               m6, [base+resize_shuf+4+r3]
    movhps               m7, [base+resize_shuf+4+r1]
%endif

    paddb                m6, m8
    paddb                m7, m8
    pshufb               m4, m6
    pshufb               m5, m7

.filter:
%if ARCH_X86_64
    movd                r8d, m2
    pshuflw              m2, m2, q3232
    movd                r9d, m2
    punpckhqdq           m2, m2
    movd               r10d, m2
    psrlq                m2, 32
    movd               r11d, m2
    movq                 m6, [base+resize_filter+r8*8]
    movq                 m7, [base+resize_filter+r10*8]
    movhps               m6, [base+resize_filter+r9*8]
    movhps               m7, [base+resize_filter+r11*8]
%else
    movd                r3d, m2
    pshufd               m2, m2, q3312
    movd                r1d, m2
    pshuflw              m2, m2, q3232
    movq                 m6, [base+resize_filter+r3*8]
    movq                 m7, [base+resize_filter+r1*8]
    movd                r3d, m2
    punpckhqdq           m2, m2
    movd                r1d, m2
    movhps               m6, [base+resize_filter+r3*8]
    movhps               m7, [base+resize_filter+r1*8]
%endif
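    ; 8 taps per pixel: pmaddubsw produces 2-tap partial sums, which
    ; phaddw/phaddsw (the latter saturating) reduce to one word per pixel
    ; before the rounding shift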
    pmaddubsw            m4, m6
    pmaddubsw            m5, m7
    phaddw               m4, m5
    phaddsw              m4, m4
    pmulhrsw             m4, m10                    ; x=(x+64)>>7
    packuswb             m4, m4
    movd          [dstq+xq], m4

    paddd                m0, m13
    add                  xd, 4
%if STACK_ALIGNMENT >= 16
    cmp                  xd, dst_wd
%else
    cmp                  xd, dst_wm
%endif
    jl .loop_x

    add                dstq, dst_stridemp
    add                srcq, src_stridemp
    dec                  hd
    jg .loop_y
    RET
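; instantiate the mc macros for each supported ISA level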
INIT_XMM ssse3
PREP_BILIN
PREP_8TAP
WARP_AFFINE_8X8
WARP_AFFINE_8X8T

INIT_XMM sse4
WARP_AFFINE_8X8
WARP_AFFINE_8X8T

INIT_XMM sse2
PREP_BILIN
PREP_8TAP
WARP_AFFINE_8X8
WARP_AFFINE_8X8T