;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; convolve_fn op[, highbd]
;
; Emits one "copy" convolve kernel:
;   op = copy : dst rows are overwritten with src rows
;   op = avg  : dst rows become the rounded average of src and dst
;               (pavgb for 8-bit samples, pavgw for 16-bit/high-bitdepth)
;
; With the optional second argument "highbd" the function is named
; highbd_convolve_<op> and operates on 16-bit samples; all pointer/stride/
; width arguments are doubled (shl by 1) up front so the per-width code
; below can work purely in BYTES per row.
;
; C-visible arguments (only the first four are loaded into registers by
; cglobal; the rest are read from the stack via the *m forms):
;   src, src_stride, dst, dst_stride, fx, fxs, fy, fys, w, h[, bd]
; The filter arguments (fx/fxs/fy/fys) are part of the common convolve
; prototype but are unused by these copy/avg kernels.
;
; Register roles inside each kernel:
;   r4d        = dispatch: width in bytes; then reused as rows remaining
;   r5q, r6q   = 3*src_stride / 3*dst_stride (w4/w8/w16 four-rows-per-
;                iteration paths only)
;   m0..m3     = row data; m4..m7 = dst data for the narrow avg paths
;                (AUX_XMM_REGS reserves them only when op == avg)
;
; Loads use movu (src may be unaligned); stores use mova
; (NOTE(review): assumes dst rows are 16-byte aligned — per libaom buffer
; allocation; confirm against callers).
;------------------------------------------------------------------------------
%macro convolve_fn 1-2
%ifidn %1, avg
; avg needs m4-m7 as scratch for dst rows in the w4/w8 paths
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           fx, fxs, fy, fys, w, h
%endif
  mov r4d, dword wm                   ; r4d = width in pixels (stack arg)
%ifidn %2, highbd
  ; 16-bit samples: convert width, pointers and strides from units of
  ; pixels/elements to bytes, so every label below means "bytes per row".
  shl r4d, 1
  shl srcq, 1
  shl src_strideq, 1
  shl dstq, 1
  shl dst_strideq, 1
%else
  cmp r4d, 4                          ; 4-pixel rows exist only in 8-bit
  je .w4
%endif
  ; Dispatch on bytes per row; for highbd, .wN handles N/2 pixels.
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

  cmp r4d, 64
  je .w64
%ifidn %2, highbd
  cmp r4d, 128
  je .w128

  ; 256 bytes/row: only reachable for highbd width 128 (falls through
  ; when every narrower compare above failed).
.w256:
  mov r4d, dword hm                   ; r4d = rows remaining
.loop256:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq    ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  movu m0, [srcq+128]
  movu m1, [srcq+128+16]
  movu m2, [srcq+128+32]
  movu m3, [srcq+128+48]
%ifidn %1, avg
  pavg m0, [dstq+128]
  pavg m1, [dstq+128+16]
  pavg m2, [dstq+128+32]
  pavg m3, [dstq+128+48]
%endif
  mova [dstq+128   ], m0
  mova [dstq+128+16], m1
  mova [dstq+128+32], m2
  mova [dstq+128+48], m3
  movu m0, [srcq+128+64]
  movu m1, [srcq+128+80]
  movu m2, [srcq+128+96]
  movu m3, [srcq+128+112]
  add srcq, src_strideq               ; advance src before the final stores
%ifidn %1, avg
  pavg m0, [dstq+128+64]
  pavg m1, [dstq+128+80]
  pavg m2, [dstq+128+96]
  pavg m3, [dstq+128+112]
%endif
  mova [dstq+128+64], m0
  mova [dstq+128+80], m1
  mova [dstq+128+96], m2
  mova [dstq+128+112], m3
  add dstq, dst_strideq
  sub r4d, 1                          ; one row per iteration
  jnz .loop256
  RET
%endif

  ; 128 bytes/row: 8-bit width 128 (fall-through) or highbd width 64.
.w128:
  mov r4d, dword hm
.loop128:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq    ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  add dstq, dst_strideq
  sub r4d, 1                          ; one row per iteration
  jnz .loop128
  RET

  ; 64 bytes/row: one row per iteration, four xmm regs cover it exactly.
.w64:
  mov r4d, dword hm
.loop64:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq    ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop64
  RET

  ; 32 bytes/row: process two rows per iteration.
.w32:
  mov r4d, dword hm
.loop32:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+src_strideq]
  movu m3, [srcq+src_strideq+16]
  lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq            +16]
  pavg m2, [dstq+dst_strideq]
  pavg m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea dstq, [dstq+dst_strideq*2]
  sub r4d, 2                          ; two rows per iteration
  jnz .loop32
  RET

  ; 16 bytes/row: process four rows per iteration; r5q/r6q hold 3*stride
  ; so rows 0..3 are addressable without extra pointer updates.
.w16:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+dst_strideq]
  pavg m2, [dstq+dst_strideq*2]
  pavg m3, [dstq+r6q]
%endif
  mova [dstq               ], m0
  mova [dstq+dst_strideq   ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4                          ; four rows per iteration
  jnz .loop16
  RET

  ; 8 bytes/row: movh moves the low 8 bytes; avg must load dst into
  ; m4-m7 first because pavg cannot take an 8-byte memory operand.
.w8:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop8:
  movh m0, [srcq]
  movh m1, [srcq+src_strideq]
  movh m2, [srcq+src_strideq*2]
  movh m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh m4, [dstq]
  movh m5, [dstq+dst_strideq]
  movh m6, [dstq+dst_strideq*2]
  movh m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movh [dstq               ], m0
  movh [dstq+dst_strideq   ], m1
  movh [dstq+dst_strideq*2], m2
  movh [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop8
  RET

; 4 bytes/row exists only for 8-bit input (highbd width 4 is 8 bytes and
; is served by .w8 above), so the label is compiled out for highbd.
%ifnidn %2, highbd
.w4:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movd m0, [srcq]
  movd m1, [srcq+src_strideq]
  movd m2, [srcq+src_strideq*2]
  movd m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd m4, [dstq]
  movd m5, [dstq+dst_strideq]
  movd m6, [dstq+dst_strideq*2]
  movd m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movd [dstq               ], m0
  movd [dstq+dst_strideq   ], m1
  movd [dstq+dst_strideq*2], m2
  movd [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop4
  RET
%endif
%endmacro

; Instantiate the SSE2 kernels: convolve_copy, convolve_avg and the
; 16-bit highbd_convolve_copy variant.
INIT_XMM sse2
convolve_fn copy
convolve_fn avg
convolve_fn copy, highbd