/*
 * Copyright (c) 2019 Shiyou Yin (yinshiyou-hf@loongson.cn)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/hevcdec.h"
#include "libavcodec/bit_depth_template.c"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavutil/mips/mmiutils.h"

/*
 * HEVC motion-compensation kernels for Loongson MMI (64-bit MIPS SIMD).
 * Each PUT_* macro below expands to one exported function per block
 * width.  Common conventions in this file:
 *   - the inner ("2:") loop produces 4 output pixels per iteration,
 *     so x_step == w / 4 iterations per row;
 *   - 16-bit intermediate rows are 0x80 bytes (128 bytes == 64 int16)
 *     apart, i.e. one MAX_PB_SIZE-wide row (see tmp_array sizing);
 *   - src_step / dst_step / src2_step are the (negative) byte amounts
 *     that rewind a pointer to column 0 after a row's inner loop, before
 *     the row stride is added.
 */

/*
 * Horizontal 8-tap qpel filter: 8-bit source -> 16-bit intermediate.
 * mx selects the tap set from ff_hevc_qpel_filters[]; src is rewound by
 * 3 pixels so the 8 taps cover [x-3 .. x+4].  Results are stored
 * unscaled (no rounding shift) into dst, rows 0x80 bytes apart.
 */
#define PUT_HEVC_QPEL_H(w, x_step, src_step, dst_step) \
void ff_hevc_put_hevc_qpel_h##w##_8_mmi(int16_t *dst, uint8_t *_src, \
                                        ptrdiff_t _srcstride, \
                                        int height, intptr_t mx, \
                                        intptr_t my, int width) \
{ \
    int x, y; \
    pixel *src = (pixel*)_src - 3; \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
    uint64_t ftmp[15]; \
    uint64_t rtmp[1]; \
    const int8_t *filter = ff_hevc_qpel_filters[mx - 1]; \
 \
    x = x_step; \
    y = height; \
    __asm__ volatile( \
        /* Sign-extend the eight int8 taps to int16: punpck places each \
         * tap in the high byte of a halfword, psrah by 8 sign-extends. \
         * ftmp1 = taps 0-3, ftmp2 = taps 4-7; ftmp0 is then zeroed. */ \
        MMI_LDC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
 \
        "1: \n\t" \
        "2: \n\t" \
        /* Unaligned 8-byte loads of the source windows for output \
         * columns x, x+1, x+2, x+3 (one pixel further each time). */ \
        "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" \
        "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" \
        "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" \
        "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" \
        "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" \
        "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" \
        "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" \
        "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" \
        /* For each window: widen bytes to halfwords, multiply by the \
         * taps and fold the two 4-lane halves together. */ \
        "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \
        /* Transpose so each lane holds one output's partial sums, then \
         * reduce across lanes: 4 finished 16-bit results in ftmp3. */ \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
        "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        "gssdlc1 %[ftmp3], 0x07(%[dst]) \n\t" \
        "gssdrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
 \
        /* Advance 4 pixels; loop over the row. */ \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
        PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        /* Row done: rewind by src_step/dst_step, then step one source \
         * row and one 0x80-byte dst row. */ \
        "daddi %[y], %[y], -0x01 \n\t" \
        "li %[x], " #x_step " \n\t" \
        PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
        PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
        PTR_ADDU "%[src], %[src], %[stride] \n\t" \
        PTR_ADDIU "%[dst], %[dst], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \
          [src]"+&r"(src), [dst]"+&r"(dst), [y]"+&r"(y), \
          [x]"+&r"(x) \
        : [filter]"r"(filter), [stride]"r"(srcstride) \
        : "memory" \
    ); \
}

PUT_HEVC_QPEL_H(4, 1, -4, -8);
PUT_HEVC_QPEL_H(8, 2, -8, -16);
PUT_HEVC_QPEL_H(12, 3, -12, -24);
PUT_HEVC_QPEL_H(16, 4, -16, -32);
PUT_HEVC_QPEL_H(24, 6, -24, -48);
PUT_HEVC_QPEL_H(32, 8, -32, -64);
PUT_HEVC_QPEL_H(48, 12, -48, -96);
PUT_HEVC_QPEL_H(64, 16, -64, -128);

/*
 * Two-pass 8-tap qpel H+V filter, 8-bit source -> 16-bit output.
 * Pass 1: horizontal filter (identical to PUT_HEVC_QPEL_H's loop) over
 * height + QPEL_EXTRA rows into the on-stack tmp_array.
 * Pass 2: vertical 8-tap on the 16-bit intermediate using pmaddhw
 * (16x16->32-bit multiply-accumulate), scaled down by >> 6 and packed
 * back to int16 into dst.
 */
#define PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step) \
void ff_hevc_put_hevc_qpel_hv##w##_8_mmi(int16_t *dst, uint8_t *_src, \
                                         ptrdiff_t _srcstride, \
                                         int height, intptr_t mx, \
                                         intptr_t my, int width) \
{ \
    int x, y; \
    const int8_t *filter; \
    pixel *src = (pixel*)_src; \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \
    int16_t *tmp = tmp_array; \
    uint64_t ftmp[15]; \
    uint64_t rtmp[1]; \
 \
    /* Rewind to the first pixel needed by both filter windows. */ \
    src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \
    filter = ff_hevc_qpel_filters[mx - 1]; \
    x = x_step; \
    y = height + QPEL_EXTRA; \
    __asm__ volatile( \
        /* Pass 1: horizontal filter into tmp (see PUT_HEVC_QPEL_H). */ \
        MMI_LDC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
 \
        "1: \n\t" \
        "2: \n\t" \
        "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" \
        "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" \
        "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" \
        "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" \
        "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" \
        "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" \
        "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" \
        "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" \
        "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
        "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        "gssdlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
        "gssdrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
 \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        "daddi %[y], %[y], -0x01 \n\t" \
        "li %[x], " #x_step " \n\t" \
        PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], " #dst_step " \n\t" \
        PTR_ADDU "%[src], %[src], %[stride] \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \
          [x]"+&r"(x) \
        : [filter]"r"(filter), [stride]"r"(srcstride) \
        : "memory" \
    ); \
 \
    /* NOTE(review): QPEL_EXTRA_BEFORE * 4 - 12 evaluates to 0 when \
     * QPEL_EXTRA_BEFORE == 3 (defined in hevcdec.h), i.e. tmp restarts \
     * at tmp_array — confirm against the header if that value changes. */ \
    tmp = tmp_array + QPEL_EXTRA_BEFORE * 4 -12; \
    filter = ff_hevc_qpel_filters[my - 1]; \
    x = x_step; \
    y = height; \
    __asm__ volatile( \
        /* Pass 2: vertical filter.  Taps are sign-extended as before; \
         * ftmp0 then holds the shift amount 6 for the final psraw. */ \
        MMI_LDC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
        "li %[rtmp0], 0x06 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
 \
        "1: \n\t" \
        "2: \n\t" \
        /* Load 8 vertically adjacent intermediate rows (0x80 bytes \
         * apart) for 4 columns, then rewind tmp by 7 rows (0x380). */ \
        "gsldlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp4], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp4], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp5], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp5], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp6], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp6], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp7], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp7], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp8], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp8], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp9], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp9], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp10], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp10], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \
        /* Transpose rows->columns, multiply-accumulate 16x16->32, and \
         * reduce; >> 6 (ftmp0) scales back to the 16-bit domain. */ \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
        "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \
        "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \
        "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
        "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \
        "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
        "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \
        "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \
        "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
        "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \
        "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
        "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        "gssdlc1 %[ftmp3], 0x07(%[dst]) \n\t" \
        "gssdrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
 \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        "daddi %[y], %[y], -0x01 \n\t" \
        "li %[x], " #x_step " \n\t" \
        PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], " #dst_step " \n\t" \
        PTR_ADDIU "%[dst], %[dst], 0x80 \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \
          [ftmp14]"=&f"(ftmp[14]), [rtmp0]"=&r"(rtmp[0]), \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), \
          [x]"+&r"(x) \
        : [filter]"r"(filter), [stride]"r"(srcstride) \
        : "memory" \
    ); \
}

PUT_HEVC_QPEL_HV(4, 1, -4, -8);
PUT_HEVC_QPEL_HV(8, 2, -8, -16);
PUT_HEVC_QPEL_HV(12, 3, -12, -24);
PUT_HEVC_QPEL_HV(16, 4, -16, -32);
PUT_HEVC_QPEL_HV(24, 6, -24, -48);
PUT_HEVC_QPEL_HV(32, 8, -32, -64);
PUT_HEVC_QPEL_HV(48, 12, -48, -96);
PUT_HEVC_QPEL_HV(64, 16, -64, -128);

/*
 * Bi-prediction horizontal 8-tap qpel: combines the horizontal filter
 * result with a second 16-bit prediction (src2), adds the rounding
 * offset (64), shifts by 7 and clamps to 8-bit before storing to dst.
 * src2 rows are 0x80 bytes apart, like all intermediate buffers here.
 */
#define PUT_HEVC_QPEL_BI_H(w, x_step, src_step, src2_step, dst_step) \
void ff_hevc_put_hevc_qpel_bi_h##w##_8_mmi(uint8_t *_dst, \
                                           ptrdiff_t _dststride, \
                                           uint8_t *_src, \
                                           ptrdiff_t _srcstride, \
                                           int16_t *src2, int height, \
                                           intptr_t mx, intptr_t my, \
                                           int width) \
{ \
    int x, y; \
    pixel *src = (pixel*)_src - 3; \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
    pixel *dst = (pixel *)_dst; \
    ptrdiff_t dststride = _dststride / sizeof(pixel); \
    const int8_t *filter = ff_hevc_qpel_filters[mx - 1]; \
    uint64_t ftmp[20]; \
    uint64_t rtmp[1]; \
    int shift = 7; \
    int offset = 64; \
 \
    x = width >> 2; \
    y = height; \
    __asm__ volatile( \
        MMI_LDC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
        /* Broadcast the 16-bit offset (64) to all four halfwords. */ \
        "punpcklhw %[offset], %[offset], %[offset] \n\t" \
        "punpcklwd %[offset], %[offset], %[offset] \n\t" \
 \
        "1: \n\t" \
        "li %[x], " #x_step " \n\t" \
        "2: \n\t" \
        "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" \
        "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" \
        "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" \
        "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" \
        "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" \
        "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" \
        "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" \
        "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" \
        "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
        "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[offset] \n\t" \
        /* Widen both predictions to int32 (punpck to the high halfword \
         * then psraw 16 sign-extends), sum, >> shift, saturate-pack. */ \
        "gsldlc1 %[ftmp4], 0x07(%[src2]) \n\t" \
        "gsldrc1 %[ftmp4], 0x00(%[src2]) \n\t" \
        "li %[rtmp0], 0x10 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp8] \n\t" \
        "punpcklhw %[ftmp5], %[ftmp0], %[ftmp3] \n\t" \
        "punpckhhw %[ftmp6], %[ftmp0], %[ftmp3] \n\t" \
        "punpckhhw %[ftmp3], %[ftmp0], %[ftmp4] \n\t" \
        "punpcklhw %[ftmp4], %[ftmp0], %[ftmp4] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \
        "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
        "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \
        "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \
        "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \
        "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \
        "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        /* Clamp: zero out negative lanes (pcmpgth mask & value), then \
         * packushb saturates the positive side to 255. */ \
        "pcmpgth %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
        "and %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \
        "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" \
        "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
 \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
        PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \
        PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        "daddi %[y], %[y], -0x01 \n\t" \
        PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
        PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
        PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \
        PTR_ADDU "%[src], %[src], %[src_stride] \n\t" \
        PTR_ADDU "%[dst], %[dst], %[dst_stride] \n\t" \
        PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \
          [ftmp12]"=&f"(ftmp[12]), [src2]"+&r"(src2), \
          [dst]"+&r"(dst), [src]"+&r"(src), [y]"+&r"(y), [x]"=&r"(x), \
          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0]) \
        : [src_stride]"r"(srcstride), [dst_stride]"r"(dststride), \
          [filter]"r"(filter), [shift]"f"(shift) \
        : "memory" \
    ); \
}

PUT_HEVC_QPEL_BI_H(4, 1, -4, -8, -4);
PUT_HEVC_QPEL_BI_H(8, 2, -8, -16, -8);
PUT_HEVC_QPEL_BI_H(12, 3, -12, -24, -12);
PUT_HEVC_QPEL_BI_H(16, 4, -16, -32, -16);
PUT_HEVC_QPEL_BI_H(24, 6, -24, -48, -24);
PUT_HEVC_QPEL_BI_H(32, 8, -32, -64, -32);
PUT_HEVC_QPEL_BI_H(48, 12, -48, -96, -48);
PUT_HEVC_QPEL_BI_H(64, 16, -64, -128, -64);

/*
 * Bi-prediction two-pass 8-tap qpel H+V filter.
 * Pass 1: horizontal filter into tmp_array (height + QPEL_EXTRA rows).
 * Pass 2: vertical filter (>> 6) combined with src2, + offset (64),
 * >> shift (7), clamped to 8-bit and stored to dst.
 */
#define PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step) \
void ff_hevc_put_hevc_qpel_bi_hv##w##_8_mmi(uint8_t *_dst, \
                                            ptrdiff_t _dststride, \
                                            uint8_t *_src, \
                                            ptrdiff_t _srcstride, \
                                            int16_t *src2, int height, \
                                            intptr_t mx, intptr_t my, \
                                            int width) \
{ \
    int x, y; \
    const int8_t *filter; \
    pixel *src = (pixel*)_src; \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
    pixel *dst = (pixel *)_dst; \
    ptrdiff_t dststride = _dststride / sizeof(pixel); \
    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \
    int16_t *tmp = tmp_array; \
    uint64_t ftmp[20]; \
    uint64_t rtmp[1]; \
    int shift = 7; \
    int offset = 64; \
 \
    src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \
    filter = ff_hevc_qpel_filters[mx - 1]; \
    x = width >> 2; \
    y = height + QPEL_EXTRA; \
    __asm__ volatile( \
        /* Pass 1: horizontal filter into tmp (see PUT_HEVC_QPEL_H). */ \
        MMI_LDC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
 \
        "1: \n\t" \
        "2: \n\t" \
        "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" \
        "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" \
        "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" \
        "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" \
        "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" \
        "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" \
        "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" \
        "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" \
        "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \
        "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
        "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
        "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
        "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
        "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        "gssdlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
        "gssdrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
 \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        "daddi %[y], %[y], -0x01 \n\t" \
        "li %[x], " #x_step " \n\t" \
        PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
        PTR_ADDU "%[src], %[src], %[stride] \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \
          [x]"+&r"(x) \
        : [filter]"r"(filter), [stride]"r"(srcstride) \
        : "memory" \
    ); \
 \
    tmp = tmp_array; \
    filter = ff_hevc_qpel_filters[my - 1]; \
    x = width >> 2; \
    y = height; \
    __asm__ volatile( \
        /* Pass 2: vertical filter + bi-prediction combine. */ \
        MMI_LDC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
        "li %[rtmp0], 0x06 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        /* Broadcast the 32-bit offset (64) to both words. */ \
        "punpcklwd %[offset], %[offset], %[offset] \n\t" \
 \
        "1: \n\t" \
        "li %[x], " #x_step " \n\t" \
        "2: \n\t" \
        "gsldlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp4], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp4], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp5], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp5], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp6], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp6], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp7], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp7], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp8], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp8], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp9], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp9], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp10], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp10], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
        "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \
        "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \
        "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
        "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \
        "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
        "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \
        "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \
        "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
        "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \
        "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
        "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        /* Combine with src2: widen to int32, add offset, >> shift, \
         * clamp negatives to zero, saturate-pack to bytes. */ \
        "gsldlc1 %[ftmp4], 0x07(%[src2]) \n\t" \
        "gsldrc1 %[ftmp4], 0x00(%[src2]) \n\t" \
        "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
        "li %[rtmp0], 0x10 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp8] \n\t" \
        "punpcklhw %[ftmp5], %[ftmp7], %[ftmp3] \n\t" \
        "punpckhhw %[ftmp6], %[ftmp7], %[ftmp3] \n\t" \
        "punpckhhw %[ftmp3], %[ftmp7], %[ftmp4] \n\t" \
        "punpcklhw %[ftmp4], %[ftmp7], %[ftmp4] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \
        "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
        "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \
        "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \
        "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        "paddw %[ftmp5], %[ftmp5], %[offset] \n\t" \
        "paddw %[ftmp6], %[ftmp6], %[offset] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \
        "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \
        "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "pcmpgth %[ftmp7], %[ftmp5], %[ftmp7] \n\t" \
        "and %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \
        "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" \
        "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
 \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
        PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        "daddi %[y], %[y], -0x01 \n\t" \
        PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
        PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
        PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \
          [ftmp14]"=&f"(ftmp[14]), [src2]"+&r"(src2), \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \
          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0]) \
        : [filter]"r"(filter), [stride]"r"(dststride), \
          [shift]"f"(shift) \
        : "memory" \
    ); \
}

PUT_HEVC_QPEL_BI_HV(4, 1, -4, -8, -4);
PUT_HEVC_QPEL_BI_HV(8, 2, -8, -16, -8);
PUT_HEVC_QPEL_BI_HV(12, 3, -12, -24, -12);
PUT_HEVC_QPEL_BI_HV(16, 4, -16, -32, -16);
PUT_HEVC_QPEL_BI_HV(24, 6, -24, -48, -24);
PUT_HEVC_QPEL_BI_HV(32, 8, -32, -64, -32);
PUT_HEVC_QPEL_BI_HV(48, 12, -48, -96, -48);
PUT_HEVC_QPEL_BI_HV(64, 16, -64, -128, -64);

/*
 * Bi-prediction two-pass 4-tap epel H+V filter (chroma), structured
 * like PUT_HEVC_QPEL_BI_HV but with 4-tap ff_hevc_epel_filters, 4-byte
 * source windows (src rewound by 1) and only height + EPEL_EXTRA
 * intermediate rows.
 */
#define PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step) \
void ff_hevc_put_hevc_epel_bi_hv##w##_8_mmi(uint8_t *_dst, \
                                            ptrdiff_t _dststride, \
                                            uint8_t *_src, \
                                            ptrdiff_t _srcstride, \
                                            int16_t *src2, int height, \
                                            intptr_t mx, intptr_t my, \
                                            int width) \
{ \
    int x, y; \
    pixel *src = (pixel *)_src; \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
    pixel *dst = (pixel *)_dst; \
    ptrdiff_t dststride = _dststride / sizeof(pixel); \
    const int8_t *filter = ff_hevc_epel_filters[mx - 1]; \
    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; \
    int16_t *tmp = tmp_array; \
    uint64_t ftmp[12]; \
    uint64_t rtmp[1]; \
    int shift = 7; \
    int offset = 64; \
 \
    src -= (EPEL_EXTRA_BEFORE * srcstride + 1); \
    x = width >> 2; \
    y = height + EPEL_EXTRA; \
    __asm__ volatile( \
        /* Pass 1: horizontal 4-tap filter into tmp.  The four int8 \
         * taps are sign-extended into ftmp1 as in the qpel kernels. */ \
        MMI_LWC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
 \
        "1: \n\t" \
        "2: \n\t" \
        /* Unaligned 4-byte loads for columns x .. x+3. */ \
        "gslwlc1 %[ftmp2], 0x03(%[src]) \n\t" \
        "gslwrc1 %[ftmp2], 0x00(%[src]) \n\t" \
        "gslwlc1 %[ftmp3], 0x04(%[src]) \n\t" \
        "gslwrc1 %[ftmp3], 0x01(%[src]) \n\t" \
        "gslwlc1 %[ftmp4], 0x05(%[src]) \n\t" \
        "gslwrc1 %[ftmp4], 0x02(%[src]) \n\t" \
        "gslwlc1 %[ftmp5], 0x06(%[src]) \n\t" \
        "gslwrc1 %[ftmp5], 0x03(%[src]) \n\t" \
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
        "pmullh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
        "pmullh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
        "pmullh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
        "pmullh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
        TRANSPOSE_4H(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], \
                     %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9]) \
        "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
        "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
        "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
        "gssdlc1 %[ftmp2], 0x07(%[tmp]) \n\t" \
        "gssdrc1 %[ftmp2], 0x00(%[tmp]) \n\t" \
 \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        "daddi %[y], %[y], -0x01 \n\t" \
        "li %[x], " #x_step " \n\t" \
        PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
        PTR_ADDU "%[src], %[src], %[stride] \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [rtmp0]"=&r"(rtmp[0]), \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \
          [x]"+&r"(x) \
        : [filter]"r"(filter), [stride]"r"(srcstride) \
        : "memory" \
    ); \
 \
    tmp = tmp_array; \
    filter = ff_hevc_epel_filters[my - 1]; \
    x = width >> 2; \
    y = height; \
    __asm__ volatile( \
        /* Pass 2: vertical 4-tap filter (>> 6 via ftmp0) combined with \
         * src2, + offset, >> shift, clamped and stored to dst. */ \
        MMI_LWC1(%[ftmp1], %[filter], 0x00) \
        "li %[rtmp0], 0x08 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
        "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
        "li %[rtmp0], 0x06 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
        "punpcklwd %[offset], %[offset], %[offset] \n\t" \
        "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" \
 \
        "1: \n\t" \
        "li %[x], " #x_step " \n\t" \
        "2: \n\t" \
        /* Load 4 vertically adjacent intermediate rows, rewind by 3 \
         * rows (0x180). */ \
        "gsldlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp4], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp4], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp5], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp5], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "gsldlc1 %[ftmp6], 0x07(%[tmp]) \n\t" \
        "gsldrc1 %[ftmp6], 0x00(%[tmp]) \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], -0x180 \n\t" \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" \
        TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp3], %[ftmp4]) \
        "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
        "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
        "pmaddhw %[ftmp7], %[ftmp5], %[ftmp1] \n\t" \
        "pmaddhw %[ftmp8], %[ftmp6], %[ftmp1] \n\t" \
        TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp5], %[ftmp6]) \
        "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
        "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
        "gsldlc1 %[ftmp4], 0x07(%[src2]) \n\t" \
        "gsldrc1 %[ftmp4], 0x00(%[src2]) \n\t" \
        "li %[rtmp0], 0x10 \n\t" \
        "dmtc1 %[rtmp0], %[ftmp8] \n\t" \
        "punpcklhw %[ftmp5], %[ftmp2], %[ftmp3] \n\t" \
        "punpckhhw %[ftmp6], %[ftmp2], %[ftmp3] \n\t" \
        "punpckhhw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
        "punpcklhw %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \
        "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
        "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \
        "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \
        "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
        "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
        "paddw %[ftmp5], %[ftmp5], %[offset] \n\t" \
        "paddw %[ftmp6], %[ftmp6], %[offset] \n\t" \
        "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \
        "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \
        "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
        "pcmpgth %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
        "and %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \
        "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
        "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" \
        "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
 \
        "daddi %[x], %[x], -0x01 \n\t" \
        PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
        PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \
        "bnez %[x], 2b \n\t" \
 \
        "daddi %[y], %[y], -0x01 \n\t" \
        PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
        PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
        PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
        PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
        "bnez %[y], 1b \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
          [ftmp10]"=&f"(ftmp[10]), [src2]"+&r"(src2), \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \
          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0]) \
        : [filter]"r"(filter), [stride]"r"(dststride), \
          [shift]"f"(shift) \
        : "memory" \
    ); \
}

PUT_HEVC_EPEL_BI_HV(4, 1, -4, -8, -4);
PUT_HEVC_EPEL_BI_HV(8, 2, -8, -16, -8);
PUT_HEVC_EPEL_BI_HV(12, 3, -12, -24, -12);
PUT_HEVC_EPEL_BI_HV(16, 4, -16, -32, -16);
PUT_HEVC_EPEL_BI_HV(24, 6, -24, -48, -24);
PUT_HEVC_EPEL_BI_HV(32, 8, -32, -64, -32);

/*
 * Bi-prediction pixel copy (no interpolation).
 * NOTE(review): this definition continues beyond the visible chunk;
 * only the prologue is shown here.
 */
#define PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step) \
void ff_hevc_put_hevc_pel_bi_pixels##w##_8_mmi(uint8_t *_dst, \
                                               ptrdiff_t _dststride, \
                                               uint8_t *_src, \
                                               ptrdiff_t _srcstride, \
                                               int16_t *src2, int height, \
                                               intptr_t mx, intptr_t my, \
                                               int width) \
{ \
    int x, y; \
    pixel *src = (pixel *)_src; \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
    pixel *dst = (pixel *)_dst; \
    ptrdiff_t dststride = _dststride / sizeof(pixel); \
    uint64_t ftmp[12]; \
uint64_t rtmp[1]; \ 880 int shift = 7; \ 881 \ 882 y = height; \ 883 x = width >> 3; \ 884 __asm__ volatile( \ 885 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ 886 "li %[rtmp0], 0x06 \n\t" \ 887 "dmtc1 %[rtmp0], %[ftmp1] \n\t" \ 888 "li %[rtmp0], 0x10 \n\t" \ 889 "dmtc1 %[rtmp0], %[ftmp10] \n\t" \ 890 "li %[rtmp0], 0x40 \n\t" \ 891 "dmtc1 %[rtmp0], %[offset] \n\t" \ 892 "punpcklhw %[offset], %[offset], %[offset] \n\t" \ 893 "punpcklwd %[offset], %[offset], %[offset] \n\t" \ 894 \ 895 "1: \n\t" \ 896 "2: \n\t" \ 897 "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" \ 898 "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" \ 899 "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" \ 900 "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" \ 901 "gsldlc1 %[ftmp3], 0x0f(%[src2]) \n\t" \ 902 "gsldrc1 %[ftmp3], 0x08(%[src2]) \n\t" \ 903 "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" \ 904 "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \ 905 "psllh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ 906 "psllh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ 907 "paddh %[ftmp4], %[ftmp4], %[offset] \n\t" \ 908 "paddh %[ftmp5], %[ftmp5], %[offset] \n\t" \ 909 "punpcklhw %[ftmp6], %[ftmp4], %[ftmp0] \n\t" \ 910 "punpckhhw %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \ 911 "punpcklhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ 912 "punpckhhw %[ftmp9], %[ftmp5], %[ftmp0] \n\t" \ 913 "punpcklhw %[ftmp4], %[ftmp0], %[ftmp3] \n\t" \ 914 "punpckhhw %[ftmp5], %[ftmp0], %[ftmp3] \n\t" \ 915 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t" \ 916 "punpcklhw %[ftmp2], %[ftmp0], %[ftmp2] \n\t" \ 917 "psraw %[ftmp2], %[ftmp2], %[ftmp10] \n\t" \ 918 "psraw %[ftmp3], %[ftmp3], %[ftmp10] \n\t" \ 919 "psraw %[ftmp4], %[ftmp4], %[ftmp10] \n\t" \ 920 "psraw %[ftmp5], %[ftmp5], %[ftmp10] \n\t" \ 921 "paddw %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ 922 "paddw %[ftmp3], %[ftmp3], %[ftmp7] \n\t" \ 923 "paddw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \ 924 "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ 925 "psraw %[ftmp2], %[ftmp2], %[shift] \n\t" \ 926 "psraw %[ftmp3], %[ftmp3], %[shift] \n\t" \ 927 "psraw 
%[ftmp4], %[ftmp4], %[shift] \n\t" \ 928 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \ 929 "packsswh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 930 "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 931 "pcmpgth %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \ 932 "pcmpgth %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \ 933 "and %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 934 "and %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ 935 "packushb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ 936 "gssdlc1 %[ftmp2], 0x07(%[dst]) \n\t" \ 937 "gssdrc1 %[ftmp2], 0x00(%[dst]) \n\t" \ 938 \ 939 "daddi %[x], %[x], -0x01 \n\t" \ 940 PTR_ADDIU "%[src], %[src], 0x08 \n\t" \ 941 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \ 942 PTR_ADDIU "%[src2], %[src2], 0x10 \n\t" \ 943 "bnez %[x], 2b \n\t" \ 944 \ 945 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \ 946 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \ 947 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \ 948 "li %[x], " #x_step " \n\t" \ 949 "daddi %[y], %[y], -0x01 \n\t" \ 950 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" \ 951 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" \ 952 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \ 953 "bnez %[y], 1b \n\t" \ 954 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \ 955 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \ 956 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \ 957 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \ 958 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \ 959 [ftmp10]"=&f"(ftmp[10]), [offset]"=&f"(ftmp[11]), \ 960 [src2]"+&r"(src2), [dst]"+&r"(dst), [src]"+&r"(src), \ 961 [x]"+&r"(x), [y]"+&r"(y), [rtmp0]"=&r"(rtmp[0]) \ 962 : [dststride]"r"(dststride), [shift]"f"(shift), \ 963 [srcstride]"r"(srcstride) \ 964 : "memory" \ 965 ); \ 966 } \ 967 968 PUT_HEVC_PEL_BI_PIXELS(8, 1, -8, -8, -16); 969 PUT_HEVC_PEL_BI_PIXELS(16, 2, -16, -16, -32); 970 PUT_HEVC_PEL_BI_PIXELS(24, 3, -24, -24, -48); 971 PUT_HEVC_PEL_BI_PIXELS(32, 4, -32, -32, -64); 972 PUT_HEVC_PEL_BI_PIXELS(48, 6, -48, -48, -96); 973 PUT_HEVC_PEL_BI_PIXELS(64, 8, -64, -64, -128); 974 975 
/*
 * Uni-predictive QPEL H+V interpolation for 8-bit HEVC, 4 output
 * pixels per inner iteration. Two passes:
 *   1) 8-tap horizontal filter (taps from ff_hevc_qpel_filters[mx-1])
 *      over height + QPEL_EXTRA rows into the int16_t tmp_array
 *      (fixed MAX_PB_SIZE row pitch, 0x80 bytes);
 *   2) 8-tap vertical filter (taps from ff_hevc_qpel_filters[my-1])
 *      over tmp, then (v + 32) >> 6, clamp to [0,255], store to dst.
 * Negative *_step macro arguments rewind the row pointers before the
 * per-row stride is applied.
 */
#define PUT_HEVC_QPEL_UNI_HV(w, x_step, src_step, dst_step, tmp_step)     \
void ff_hevc_put_hevc_qpel_uni_hv##w##_8_mmi(uint8_t *_dst,               \
                                             ptrdiff_t _dststride,        \
                                             uint8_t *_src,               \
                                             ptrdiff_t _srcstride,        \
                                             int height,                  \
                                             intptr_t mx, intptr_t my,    \
                                             int width)                   \
{                                                                         \
    int x, y;                                                             \
    const int8_t *filter;                                                 \
    pixel *src = (pixel*)_src;                                            \
    ptrdiff_t srcstride = _srcstride / sizeof(pixel);                     \
    pixel *dst = (pixel *)_dst;                                           \
    ptrdiff_t dststride = _dststride / sizeof(pixel);                     \
    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];          \
    int16_t *tmp = tmp_array;                                             \
    uint64_t ftmp[20];                                                    \
    uint64_t rtmp[1];                                                     \
    int shift = 6;                                                        \
    int offset = 32;                                                      \
                                                                          \
    /* back up to the first row/column the 8-tap filter needs */          \
    src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                         \
    filter = ff_hevc_qpel_filters[mx - 1];                                \
    x = width >> 2;                                                       \
    y = height + QPEL_EXTRA;                                              \
    __asm__ volatile(                                                     \
        /* expand the 8 int8 taps into 2x4 signed halfwords
         * (ftmp1 = taps 0-3, ftmp2 = taps 4-7) */                        \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                               \
        "li         %[rtmp0],   0x08                                \n\t" \
        "dmtc1      %[rtmp0],   %[ftmp0]                            \n\t" \
        "punpckhbh  %[ftmp2],   %[ftmp0],   %[ftmp1]                \n\t" \
        "punpcklbh  %[ftmp1],   %[ftmp0],   %[ftmp1]                \n\t" \
        "psrah      %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t" \
        "psrah      %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t" \
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]                \n\t" \
                                                                          \
        "1:                                                         \n\t" \
        "2:                                                         \n\t" \
        /* 4 overlapping 8-pixel windows starting at src+0..3 */          \
        "gsldlc1    %[ftmp3],   0x07(%[src])                        \n\t" \
        "gsldrc1    %[ftmp3],   0x00(%[src])                        \n\t" \
        "gsldlc1    %[ftmp4],   0x08(%[src])                        \n\t" \
        "gsldrc1    %[ftmp4],   0x01(%[src])                        \n\t" \
        "gsldlc1    %[ftmp5],   0x09(%[src])                        \n\t" \
        "gsldrc1    %[ftmp5],   0x02(%[src])                        \n\t" \
        "gsldlc1    %[ftmp6],   0x0a(%[src])                        \n\t" \
        "gsldrc1    %[ftmp6],   0x03(%[src])                        \n\t" \
        /* per window: widen to halfwords, multiply by taps, and
         * reduce the 8 products to 4 partial sums */                     \
        "punpcklbh  %[ftmp7],   %[ftmp3],   %[ftmp0]                \n\t" \
        "punpckhbh  %[ftmp8],   %[ftmp3],   %[ftmp0]                \n\t" \
        "pmullh     %[ftmp7],   %[ftmp7],   %[ftmp1]                \n\t" \
        "pmullh     %[ftmp8],   %[ftmp8],   %[ftmp2]                \n\t" \
        "paddh      %[ftmp3],   %[ftmp7],   %[ftmp8]                \n\t" \
        "punpcklbh  %[ftmp7],   %[ftmp4],   %[ftmp0]                \n\t" \
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]                \n\t" \
        "pmullh     %[ftmp7],   %[ftmp7],   %[ftmp1]                \n\t" \
        "pmullh     %[ftmp8],   %[ftmp8],   %[ftmp2]                \n\t" \
        "paddh      %[ftmp4],   %[ftmp7],   %[ftmp8]                \n\t" \
        "punpcklbh  %[ftmp7],   %[ftmp5],   %[ftmp0]                \n\t" \
        "punpckhbh  %[ftmp8],   %[ftmp5],   %[ftmp0]                \n\t" \
        "pmullh     %[ftmp7],   %[ftmp7],   %[ftmp1]                \n\t" \
        "pmullh     %[ftmp8],   %[ftmp8],   %[ftmp2]                \n\t" \
        "paddh      %[ftmp5],   %[ftmp7],   %[ftmp8]                \n\t" \
        "punpcklbh  %[ftmp7],   %[ftmp6],   %[ftmp0]                \n\t" \
        "punpckhbh  %[ftmp8],   %[ftmp6],   %[ftmp0]                \n\t" \
        "pmullh     %[ftmp7],   %[ftmp7],   %[ftmp1]                \n\t" \
        "pmullh     %[ftmp8],   %[ftmp8],   %[ftmp2]                \n\t" \
        "paddh      %[ftmp6],   %[ftmp7],   %[ftmp8]                \n\t" \
        /* transpose + horizontal add -> 4 int16 results, store */        \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],              \
                     %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])             \
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t" \
        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t" \
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp5]                \n\t" \
        "gssdlc1    %[ftmp3],   0x07(%[tmp])                        \n\t" \
        "gssdrc1    %[ftmp3],   0x00(%[tmp])                        \n\t" \
                                                                          \
        "daddi      %[x],       %[x],      -0x01                    \n\t" \
        PTR_ADDIU  "%[src],     %[src],     0x04                    \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x08                    \n\t" \
        "bnez       %[x],       2b                                  \n\t" \
                                                                          \
        /* next row: 0x80 bytes = one MAX_PB_SIZE row of int16_t */       \
        "daddi      %[y],       %[y],      -0x01                    \n\t" \
        "li         %[x],     " #x_step "                           \n\t" \
        PTR_ADDIU  "%[src],     %[src],   " #src_step "             \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],   " #tmp_step "             \n\t" \
        PTR_ADDU   "%[src],     %[src],     %[stride]               \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "bnez       %[y],       1b                                  \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                   \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                   \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                   \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                   \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                   \
          [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),                 \
          [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                  \
          [x]"+&r"(x)                                                     \
        : [filter]"r"(filter), [stride]"r"(srcstride)                     \
        : "memory"                                                        \
    );                                                                    \
                                                                          \
    /* Second pass: vertical 8-tap over the intermediates, round,
     * shift, clamp to u8 and store to dst. */                            \
    tmp = tmp_array;                                                      \
    filter = ff_hevc_qpel_filters[my - 1];                                \
    x = width >> 2;                                                       \
    y = height;                                                           \
    __asm__ volatile(                                                     \
        /* taps 0-3 -> ftmp1, taps 4-7 -> ftmp2; ftmp0 = 6 (shift);
         * splat the 32 rounding offset across all halfwords */           \
        MMI_LDC1(%[ftmp1], %[filter], 0x00)                               \
        "li         %[rtmp0],   0x08                                \n\t" \
        "dmtc1      %[rtmp0],   %[ftmp0]                            \n\t" \
        "punpckhbh  %[ftmp2],   %[ftmp0],   %[ftmp1]                \n\t" \
        "punpcklbh  %[ftmp1],   %[ftmp0],   %[ftmp1]                \n\t" \
        "psrah      %[ftmp1],   %[ftmp1],   %[ftmp0]                \n\t" \
        "psrah      %[ftmp2],   %[ftmp2],   %[ftmp0]                \n\t" \
        "li         %[rtmp0],   0x06                                \n\t" \
        "dmtc1      %[rtmp0],   %[ftmp0]                            \n\t" \
        "punpcklhw  %[offset],  %[offset],  %[offset]               \n\t" \
        "punpcklwd  %[offset],  %[offset],  %[offset]               \n\t" \
                                                                          \
        "1:                                                         \n\t" \
        "li         %[x],     " #x_step "                           \n\t" \
        "2:                                                         \n\t" \
        /* load 8 vertically adjacent 4x1 strips (pitch 0x80) */          \
        "gsldlc1    %[ftmp3],   0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp3],   0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "gsldlc1    %[ftmp4],   0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp4],   0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "gsldlc1    %[ftmp5],   0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp5],   0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "gsldlc1    %[ftmp6],   0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp6],   0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "gsldlc1    %[ftmp7],   0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp7],   0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "gsldlc1    %[ftmp8],   0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp8],   0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "gsldlc1    %[ftmp9],   0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp9],   0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "gsldlc1    %[ftmp10],  0x07(%[tmp])                        \n\t" \
        "gsldrc1    %[ftmp10],  0x00(%[tmp])                        \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],    -0x380                   \n\t" \
        /* transpose rows 0-3 and 4-7 into columns, then dot each
         * column with the split taps and >> 6 */                         \
        TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],              \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])          \
        TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],             \
                     %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])          \
        "pmaddhw    %[ftmp11],  %[ftmp3],   %[ftmp1]                \n\t" \
        "pmaddhw    %[ftmp12],  %[ftmp7],   %[ftmp2]                \n\t" \
        "pmaddhw    %[ftmp13],  %[ftmp4],   %[ftmp1]                \n\t" \
        "pmaddhw    %[ftmp14],  %[ftmp8],   %[ftmp2]                \n\t" \
        "paddw      %[ftmp11],  %[ftmp11],  %[ftmp12]               \n\t" \
        "paddw      %[ftmp13],  %[ftmp13],  %[ftmp14]               \n\t" \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])            \
        "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]                \n\t" \
        "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]                \n\t" \
        "pmaddhw    %[ftmp11],  %[ftmp5],   %[ftmp1]                \n\t" \
        "pmaddhw    %[ftmp12],  %[ftmp9],   %[ftmp2]                \n\t" \
        "pmaddhw    %[ftmp13],  %[ftmp6],   %[ftmp1]                \n\t" \
        "pmaddhw    %[ftmp14],  %[ftmp10],  %[ftmp2]                \n\t" \
        "paddw      %[ftmp11],  %[ftmp11],  %[ftmp12]               \n\t" \
        "paddw      %[ftmp13],  %[ftmp13],  %[ftmp14]               \n\t" \
        TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])            \
        "paddw      %[ftmp5],   %[ftmp5],   %[ftmp6]                \n\t" \
        "psraw      %[ftmp5],   %[ftmp5],   %[ftmp0]                \n\t" \
        /* (v + 32) >> 6, clamp to [0,255], store 4 pixels */             \
        "packsswh   %[ftmp3],   %[ftmp3],   %[ftmp5]                \n\t" \
        "paddh      %[ftmp3],   %[ftmp3],   %[offset]               \n\t" \
        "psrah      %[ftmp3],   %[ftmp3],   %[shift]                \n\t" \
        "xor        %[ftmp7],   %[ftmp7],   %[ftmp7]                \n\t" \
        "pcmpgth    %[ftmp7],   %[ftmp3],   %[ftmp7]                \n\t" \
        "and        %[ftmp3],   %[ftmp3],   %[ftmp7]                \n\t" \
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp3]                \n\t" \
        "gsswlc1    %[ftmp3],   0x03(%[dst])                        \n\t" \
        "gsswrc1    %[ftmp3],   0x00(%[dst])                        \n\t" \
                                                                          \
        "daddi      %[x],       %[x],      -0x01                    \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x08                    \n\t" \
        PTR_ADDIU  "%[dst],     %[dst],     0x04                    \n\t" \
        "bnez       %[x],       2b                                  \n\t" \
                                                                          \
        "daddi      %[y],       %[y],      -0x01                    \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],   " #tmp_step "             \n\t" \
        PTR_ADDIU  "%[dst],     %[dst],   " #dst_step "             \n\t" \
        PTR_ADDU   "%[dst],     %[dst],     %[stride]               \n\t" \
        PTR_ADDIU  "%[tmp],     %[tmp],     0x80                    \n\t" \
        "bnez       %[y],       1b                                  \n\t" \
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                   \
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                   \
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                   \
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                   \
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                   \
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),               \
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),               \
          [ftmp14]"=&f"(ftmp[14]),                                        \
          [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),     \
          [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                    \
        : [filter]"r"(filter), [stride]"r"(dststride),                    \
          [shift]"f"(shift)                                               \
        : "memory"                                                        \
    );                                                                    \
}

PUT_HEVC_QPEL_UNI_HV(4, 1, -4, -4, -8);
PUT_HEVC_QPEL_UNI_HV(8, 2, -8, -8, -16);
PUT_HEVC_QPEL_UNI_HV(12, 3, -12, -12, -24);
PUT_HEVC_QPEL_UNI_HV(16, 4, -16, -16, -32);
PUT_HEVC_QPEL_UNI_HV(24, 6, -24, -24, -48);
PUT_HEVC_QPEL_UNI_HV(32, 8, -32, -32, -64);
PUT_HEVC_QPEL_UNI_HV(48, 12, -48, -48, -96);
PUT_HEVC_QPEL_UNI_HV(64, 16, -64, -64, -128);