;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION .text

%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
    psrldq %1, %2, %3
%else
    mova   %1, %2
    RSHIFT %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
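; Worked example of the MMX strategy used below: within each dword the bytes
; [0 1 2 3] must become [2 1 0 3]. pshufw with imm8 177 (0xB1) swaps the two
; words of every dword, turning [0 1 2 3] into [2 3 0 1]; keeping bytes 0 and 2
; of the swapped copy (0x00FF-per-word mask, m6) and bytes 1 and 3 of the
; original (0xFF00-per-word mask, m7) then gives the result:
;     src:                      0 1 2 3
;     pshufw src, 0xB1:         2 3 0 1
;     (swapped & m6) | (src & m7) = 2 1 0 3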
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    mova   m6, [pb_mask_shuffle2103_mmx]
    mova   m7, m6
    psllq  m7, 8

    movsxdifnidn wq, wd
    mov    xq, wq

    add    srcq, wq
    add    dstq, wq
    neg    wq

;calc scalar loop count
    and    xq, mmsize * 2 - 4
    je     .loop_simd

.loop_scalar:
    mov    tmpb, [srcq + wq + 2]
    mov    [dstq + wq + 0], tmpb
    mov    tmpb, [srcq + wq + 1]
    mov    [dstq + wq + 1], tmpb
    mov    tmpb, [srcq + wq + 0]
    mov    [dstq + wq + 2], tmpb
    mov    tmpb, [srcq + wq + 3]
    mov    [dstq + wq + 3], tmpb
    add    wq, 4
    sub    xq, 4
    jg     .loop_scalar

;skip the SIMD loop if nothing remains (src_size < mmsize * 2)
    cmp    wq, 0
    jge    .end

.loop_simd:
    movu   m0, [srcq + wq]
    movu   m1, [srcq + wq + 8]

    pshufw m3, m0, 177 ; 0xB1: swap the two words of each dword
    pshufw m5, m1, 177

    pand   m0, m7 ; keep bytes 1 and 3 of the original
    pand   m3, m6 ; keep bytes 0 and 2 of the swapped copy

    pand   m1, m7
    pand   m5, m6

    por    m0, m3
    por    m1, m5

    movu   [dstq + wq], m0
    movu   [dstq + wq + 8], m1

    add    wq, mmsize * 2
    jl     .loop_simd

.end:
    RET

;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-%4: byte index shuffle pattern
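; The pshufb-based macro below implements, per 4-byte group,
;     dst[4*i + n] = src[4*i + idx[n]],  n = 0..3
; where idx is the (%1, %2, %3, %4) pattern baked into the matching
; pb_shuffle%1%2%3%4 constant, repeated for every dword of the register.
; For example, shuffle_bytes_3210 reverses the bytes of each dword:
; [0 1 2 3] -> [3 2 1 0]. For the AVX2 build, VBROADCASTI128 replicates
; the 16-byte pattern into both 128-bit lanes; since pshufb never crosses
; lane boundaries, the same per-dword mapping holds for ymm registers.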
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
    VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
    movsxdifnidn wq, wd
    mov    xq, wq

    add    srcq, wq
    add    dstq, wq
    neg    wq

;calc scalar loop count
    and    xq, mmsize - 4
    je     .loop_simd

.loop_scalar:
    mov    tmpb, [srcq + wq + %1]
    mov    [dstq + wq + 0], tmpb
    mov    tmpb, [srcq + wq + %2]
    mov    [dstq + wq + 1], tmpb
    mov    tmpb, [srcq + wq + %3]
    mov    [dstq + wq + 2], tmpb
    mov    tmpb, [srcq + wq + %4]
    mov    [dstq + wq + 3], tmpb
    add    wq, 4
    sub    xq, 4
    jg     .loop_scalar

;skip the SIMD loop if nothing remains (src_size < mmsize)
    cmp    wq, 0
    jge    .end

.loop_simd:
    movu   m1, [srcq + wq]
    pshufb m1, m0
    movu   [dstq + wq], m1
    add    wq, mmsize
    jl     .loop_simd

.end:
    RET
%endmacro

INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
%endif
%endif

;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;              const uint8_t *src, int width, int height,
;              int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
    pxor    m0, m0
    pcmpeqw m1, m1
    psrlw   m1, 8 ; m1 = 0x00FF in every word: keeps the low byte of each word

    movsxdifnidn wq, wd
    movsxdifnidn lum_strideq, lum_strided
    movsxdifnidn chrom_strideq, chrom_strided
    movsxdifnidn src_strideq, src_strided

    mov    back_wq, wq
    mov    whalfq, wq
    shr    whalfq, 1 ; whalf = width / 2

    lea    srcq, [srcq + wq * 2]
    add    ydstq, wq
    add    udstq, whalfq
    add    vdstq, whalfq

.loop_line:
    mov    xq, wq
    mov    wtwoq, wq
    add    wtwoq, wtwoq ; wtwo = width * 2

    neg    wq
    neg    wtwoq
    neg    whalfq

    ;calc scalar loop count
    and    xq, mmsize * 2 - 1
    je     .loop_simd

.loop_scalar:
    mov    tmpb, [srcq + wtwoq + 0]
    mov    [udstq + whalfq], tmpb

    mov    tmpb, [srcq + wtwoq + 1]
    mov    [ydstq + wq], tmpb

    mov    tmpb, [srcq + wtwoq + 2]
    mov    [vdstq + whalfq], tmpb

    mov    tmpb, [srcq + wtwoq + 3]
    mov    [ydstq + wq + 1], tmpb

    add    wq, 2
    add    wtwoq, 4
    add    whalfq, 1
    sub    xq, 2
    jg     .loop_scalar

    ; check if the SIMD loop is needed
    cmp    wq, 0
    jge    .end_line

.loop_simd:
    movu   m2, [srcq + wtwoq]
    movu   m3, [srcq + wtwoq + mmsize]
    movu   m4, [srcq + wtwoq + mmsize * 2]
    movu   m5, [srcq + wtwoq + mmsize * 3]

    ; extract y part 1
    RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
    pand   m6, m1         ; YxYx YxYx...

    RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
    pand   m7, m1         ; YxYx YxYx...

    packuswb m6, m7       ; YYYY YYYY...
    movu   [ydstq + wq], m6

    ; extract y part 2
    RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
    pand   m6, m1         ; YxYx YxYx...

    RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
    pand   m7, m1         ; YxYx YxYx...

    packuswb m6, m7       ; YYYY YYYY...
    movu   [ydstq + wq + mmsize], m6

    ; extract uv
    pand   m2, m1 ; UxVx...
    pand   m3, m1 ; UxVx...
    pand   m4, m1 ; UxVx...
    pand   m5, m1 ; UxVx...

    packuswb m2, m3 ; UVUV...
    packuswb m4, m5 ; UVUV...

    ; U
    pand   m6, m2, m1 ; UxUx...
    pand   m7, m4, m1 ; UxUx...

    packuswb m6, m7 ; UUUU
    movu   [udstq + whalfq], m6

    ; V
    psrlw  m2, 8 ; VxVx...
    psrlw  m4, 8 ; VxVx...
    packuswb m2, m4 ; VVVV
    movu   [vdstq + whalfq], m2

    add    whalfq, mmsize
    add    wtwoq, mmsize * 4
    add    wq, mmsize * 2
    jl     .loop_simd

.end_line:
    add    srcq, src_strideq
    add    ydstq, lum_strideq
    add    udstq, chrom_strideq
    add    vdstq, chrom_strideq

    ;restore the initial state of the per-line variables
    mov    wq, back_wq
    mov    xq, wq
    mov    whalfq, wq
    shr    whalfq, 1 ; whalf = width / 2
    sub    hd, 1
    jg     .loop_line

    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422

INIT_XMM avx
UYVY_TO_YUV422
%endif
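; Worked example for one iteration of the uyvytoyuv422 SIMD loop above
; (illustrative; mmsize = 16 for the SSE2/AVX builds). Each pass reads
; mmsize * 4 = 64 bytes of packed UYVY,
;     src:  U0 Y0 V0 Y1  U1 Y2 V1 Y3 ...
; and writes the three planar outputs:
;     ydst: Y0 Y1 Y2 Y3 ...   (32 bytes: the odd source bytes)
;     udst: U0 U1 U2 ...      (16 bytes: every 4th source byte, offset 0)
;     vdst: V0 V1 V2 ...      (16 bytes: every 4th source byte, offset 2)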