;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

; Rounding constants added before the normalising shift of each DC predictor.
; NOTE: pw_16 and pw_32 are stored as dwords (dd) despite the "pw_" prefix,
; because the 16x16 and 32x32 predictors finish their sum in 32-bit lanes.
SECTION_RODATA
pw_4:  times 8 dw 4
pw_8:  times 8 dw 8
pw_16: times 4 dd 16
pw_32: times 4 dd 32

SECTION .text

;------------------------------------------------------------------------------
; highbd_dc_predictor_4x4(dst, stride, above, left)
;
; Fills a 4x4 block of 16-bit samples with the rounded mean of the 4 samples
; at [above] and the 4 samples at [left]:  dc = (sum + 4) >> 3.
; Rows are strideq*2 bytes apart, i.e. stride is counted in 16-bit samples.
; GET_GOT / GLOBAL / RESTORE_GOT come from the included macro package;
; presumably they provide position-independent access to the constants.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [aboveq]            ; 4 words from above
  movq                  m2, [leftq]             ; 4 words from left
  paddw                 m0, m2                  ; a[i] + l[i]
  pshuflw               m1, m0, 0xe             ; bring words 2,3 down to 0,1
  paddw                 m0, m1
  pshuflw               m1, m0, 0x1             ; bring word 1 down to 0
  paddw                 m0, m1                  ; word 0 = sum of all 8 samples
  paddw                 m0, [GLOBAL(pw_4)]      ; + rounding
  ; Logical shift: the sum is unsigned, and this matches the 8x8 predictor.
  ; Gives the same result as an arithmetic shift whenever sum+4 < 0x8000.
  psrlw                 m0, 3
  pshuflw               m0, m0, 0x0             ; broadcast dc to 4 words
  movq    [dstq          ], m0                  ; row 0
  movq    [dstq+strideq*2], m0                  ; row 1
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  movq    [dstq          ], m0                  ; row 2
  movq    [dstq+strideq*2], m0                  ; row 3

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_8x8(dst, stride, above, left)
;
; Fills an 8x8 block of 16-bit samples with the rounded mean of 8 samples at
; [above] and 8 samples at [left]:  dc = (sum + 8) >> 4.
; Uses aligned 16-byte accesses (mova) for above, left and every dst row.
; NOTE(review): the reduction goes through pmaddwd/packssdw, which are signed,
; so it assumes every 8-sample partial sum stays below 0x8000 (holds for
; sample values up to 12 bits) - confirm against the callers' bit depths.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1                  ; zero, used as pack filler
  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  ; above/left are consumed; reuse their registers for stride3 and 'one'.
  DEFINE_ARGS dst, stride, stride3, one
  mov                 oned, 0x00010001
  lea             stride3q, [strideq*3]
  movd                  m3, oned
  pshufd                m3, m3, 0x0             ; every word = 1
  paddw                 m0, m2                  ; a[i] + l[i]
  pmaddwd               m0, m3                  ; add adjacent words -> 4 dwords
  packssdw              m0, m1                  ; back to words
  pmaddwd               m0, m3                  ; -> 2 dwords
  packssdw              m0, m1
  pmaddwd               m0, m3                  ; low dword = sum of 16 samples
  paddw                 m0, [GLOBAL(pw_8)]      ; + rounding
  psrlw                 m0, 4
  pshuflw               m0, m0, 0x0             ; broadcast dc to low 4 words
  punpcklqdq            m0, m0                  ; ... and to all 8 words
  mova   [dstq           ], m0                  ; row 0
  mova   [dstq+strideq*2 ], m0                  ; row 1
  mova   [dstq+strideq*4 ], m0                  ; row 2
  mova   [dstq+stride3q*2], m0                  ; row 3
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  mova   [dstq           ], m0                  ; row 4
  mova   [dstq+strideq*2 ], m0                  ; row 5
  mova   [dstq+strideq*4 ], m0                  ; row 6
  mova   [dstq+stride3q*2], m0                  ; row 7

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_16x16(dst, stride, above, left)
;
; Fills a 16x16 block of 16-bit samples with the rounded mean of 16 samples at
; [above] and 16 samples at [left]:  dc = (sum + 16) >> 5.
; The sum is accumulated in word lanes first, then widened to dwords.
; Uses aligned 16-byte accesses (mova) throughout.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1                  ; zero, used for widening
  mova                  m0, [aboveq]
  mova                  m3, [aboveq+16]
  mova                  m2, [leftq]
  mova                  m4, [leftq+16]
  ; above/left are consumed; reuse their registers for stride3 and a counter.
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4                   ; 4 iterations x 4 rows
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4                  ; 8 lanes, 4 samples each
  movhlps               m2, m0
  paddw                 m0, m2                  ; low 4 lanes, 8 samples each
  punpcklwd             m0, m1                  ; zero-extend to 4 dwords
  movhlps               m2, m0
  paddd                 m0, m2                  ; low 2 dwords
  punpckldq             m0, m1                  ; spread them to dwords 0 and 2
  movhlps               m2, m0
  paddd                 m0, m2                  ; dword 0 = sum of 32 samples
  paddd                 m0, [GLOBAL(pw_16)]     ; + rounding (dword constant)
  psrad                 m0, 5
  pshuflw               m0, m0, 0x0             ; broadcast dc to low 4 words
  punpcklqdq            m0, m0                  ; ... and to all 8 words
.loop:
  mova   [dstq              ], m0               ; row 0
  mova   [dstq           +16], m0
  mova   [dstq+strideq*2    ], m0               ; row 1
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*4    ], m0               ; row 2
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+stride3q*2   ], m0               ; row 3
  mova   [dstq+stride3q*2+16], m0
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_32x32(dst, stride, above, left)
;
; Fills a 32x32 block of 16-bit samples with the rounded mean of 32 samples at
; [above] and 32 samples at [left]:  dc = (sum + 32) >> 6.
; Word lanes hold up to 16 samples each before being zero-extended to dwords.
; Uses aligned 16-byte accesses (mova) throughout.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  paddw                 m0, m2
  paddw                 m3, m4
  mova                  m2, [leftq]
  mova                  m4, [leftq+16]
  mova                  m5, [leftq+32]
  mova                  m6, [leftq+48]
  paddw                 m2, m4
  paddw                 m5, m6
  paddw                 m0, m3                  ; all of above, 4 samples/lane
  paddw                 m2, m5                  ; all of left,  4 samples/lane
  pxor                  m1, m1                  ; zero, used for widening
  paddw                 m0, m2                  ; 8 lanes, 8 samples each
  ; above/left are consumed; reuse their registers for stride3 and a counter.
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8                   ; 8 iterations x 4 rows
  movhlps               m2, m0
  paddw                 m0, m2                  ; low 4 lanes, 16 samples each
  punpcklwd             m0, m1                  ; zero-extend to 4 dwords
  movhlps               m2, m0
  paddd                 m0, m2                  ; low 2 dwords
  punpckldq             m0, m1                  ; spread them to dwords 0 and 2
  movhlps               m2, m0
  paddd                 m0, m2                  ; dword 0 = sum of 64 samples
  paddd                 m0, [GLOBAL(pw_32)]     ; + rounding (dword constant)
  psrad                 m0, 6
  pshuflw               m0, m0, 0x0             ; broadcast dc to low 4 words
  punpcklqdq            m0, m0                  ; ... and to all 8 words
.loop:
  mova [dstq               ], m0                ; row 0
  mova [dstq          +16  ], m0
  mova [dstq          +32  ], m0
  mova [dstq          +48  ], m0
  mova [dstq+strideq*2     ], m0                ; row 1
  mova [dstq+strideq*2+16  ], m0
  mova [dstq+strideq*2+32  ], m0
  mova [dstq+strideq*2+48  ], m0
  mova [dstq+strideq*4     ], m0                ; row 2
  mova [dstq+strideq*4+16  ], m0
  mova [dstq+strideq*4+32  ], m0
  mova [dstq+strideq*4+48  ], m0
  mova [dstq+stride3q*2    ], m0                ; row 3
  mova [dstq+stride3q*2 +16], m0
  mova [dstq+stride3q*2 +32], m0
  mova [dstq+stride3q*2 +48], m0
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; highbd_v_predictor_4x4(dst, stride, above)
;
; Copies the 4 16-bit samples at [above] into each of the 4 rows of dst.
; Rows are strideq*2 bytes apart.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  movq    [dstq          ], m0                  ; row 0
  movq    [dstq+strideq*2], m0                  ; row 1
  lea                 dstq, [dstq+strideq*4]    ; advance two rows
  movq    [dstq          ], m0                  ; row 2
  movq    [dstq+strideq*2], m0                  ; row 3
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_8x8(dst, stride, above)
;
; Copies the 8 16-bit samples at [above] into each of the 8 rows of dst.
; Uses aligned 16-byte accesses (mova) for above and every dst row.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
  mova                  m0, [aboveq]
  ; above is consumed; its register is reused for stride3.
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  mova   [dstq           ], m0                  ; row 0
  mova   [dstq+strideq*2 ], m0                  ; row 1
  mova   [dstq+strideq*4 ], m0                  ; row 2
  mova   [dstq+stride3q*2], m0                  ; row 3
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  mova   [dstq           ], m0                  ; row 4
  mova   [dstq+strideq*2 ], m0                  ; row 5
  mova   [dstq+strideq*4 ], m0                  ; row 6
  mova   [dstq+stride3q*2], m0                  ; row 7
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_16x16(dst, stride, above)
;
; Copies the 16 16-bit samples at [above] into each of the 16 rows of dst,
; four rows per loop iteration. Uses aligned 16-byte accesses (mova).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  ; above is consumed; its register is reused for stride3.
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov              nlines4d, 4                  ; 4 iterations x 4 rows
.loop:
  mova    [dstq              ], m0              ; row 0
  mova    [dstq           +16], m1
  mova    [dstq+strideq*2    ], m0              ; row 1
  mova    [dstq+strideq*2 +16], m1
  mova    [dstq+strideq*4    ], m0              ; row 2
  mova    [dstq+strideq*4 +16], m1
  mova    [dstq+stride3q*2   ], m0              ; row 3
  mova    [dstq+stride3q*2+16], m1
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec              nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_v_predictor_32x32(dst, stride, above)
;
; Copies the 32 16-bit samples at [above] into each of the 32 rows of dst,
; four rows per loop iteration. Uses aligned 16-byte accesses (mova).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  mova                  m2, [aboveq+32]
  mova                  m3, [aboveq+48]
  ; above is consumed; its register is reused for stride3.
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov              nlines4d, 8                  ; 8 iterations x 4 rows
.loop:
  mova [dstq               ], m0                ; row 0
  mova [dstq            +16], m1
  mova [dstq            +32], m2
  mova [dstq            +48], m3
  mova [dstq+strideq*2     ], m0                ; row 1
  mova [dstq+strideq*2  +16], m1
  mova [dstq+strideq*2  +32], m2
  mova [dstq+strideq*2  +48], m3
  mova [dstq+strideq*4     ], m0                ; row 2
  mova [dstq+strideq*4  +16], m1
  mova [dstq+strideq*4  +32], m2
  mova [dstq+strideq*4  +48], m3
  mova [dstq+stride3q*2    ], m0                ; row 3
  mova [dstq+stride3q*2 +16], m1
  mova [dstq+stride3q*2 +32], m2
  mova [dstq+stride3q*2 +48], m3
  lea                 dstq, [dstq+strideq*8]    ; advance four rows
  dec              nlines4d
  jnz .loop
  REP_RET