;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION .text

%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift (in bytes)
%if cpuflag(avx)
    psrldq  %1, %2, %3
%else
    mova    %1, %2
    RSHIFT  %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    mova   m6, [pb_mask_shuffle2103_mmx]
    mova   m7, m6
    psllq  m7, 8

    movsxdifnidn wq, wd
    mov          xq, wq

    add srcq, wq
    add dstq, wq
    neg wq

; calc scalar loop count
    and xq, mmsize * 2 - 4
    je .loop_simd

.loop_scalar:
    mov          tmpb, [srcq + wq + 2]
    mov [dstq+wq + 0], tmpb
    mov          tmpb, [srcq + wq + 1]
    mov [dstq+wq + 1], tmpb
    mov          tmpb, [srcq + wq + 0]
    mov [dstq+wq + 2], tmpb
    mov          tmpb, [srcq + wq + 3]
    mov [dstq+wq + 3], tmpb
    add            wq, 4
    sub            xq, 4
    jg .loop_scalar

; check if src_size < mmsize * 2
    cmp wq, 0
    jge .end

.loop_simd:
    movu m0, [srcq+wq]
    movu m1, [srcq+wq+8]

    pshufw m3, m0, 177
    pshufw m5, m1, 177

    pand m0, m7
    pand m3, m6

    pand m1, m7
    pand m5, m6

    por m0, m3
    por m1, m5

    movu [dstq+wq],     m0
    movu [dstq+wq + 8], m1

    add wq, mmsize*2
    jl .loop_simd

.end:
    RET
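
; Worked example for the mask/pshufw trick above: for source bytes
; b0 b1 b2 b3 (one pixel), pshufw with imm8 177 (0b10110001) swaps the two
; words of each dword, giving b2 b3 b0 b1.  m7 (0xff00 per word) keeps b1
; and b3 from the original, m6 (0x00ff per word) keeps b2 and b0 from the
; swapped copy, and por merges them into b2 b1 b0 b3, i.e. the 2, 1, 0, 3
; ordering.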

;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-%4 index shuffle
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
    VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
    movsxdifnidn wq, wd
    mov          xq, wq

    add srcq, wq
    add dstq, wq
    neg wq

; calc scalar loop count
    and xq, mmsize - 4
    je .loop_simd

.loop_scalar:
    mov          tmpb, [srcq + wq + %1]
    mov [dstq+wq + 0], tmpb
    mov          tmpb, [srcq + wq + %2]
    mov [dstq+wq + 1], tmpb
    mov          tmpb, [srcq + wq + %3]
    mov [dstq+wq + 2], tmpb
    mov          tmpb, [srcq + wq + %4]
    mov [dstq+wq + 3], tmpb
    add            wq, 4
    sub            xq, 4
    jg .loop_scalar

; check if src_size < mmsize
    cmp wq, 0
    jge .end

.loop_simd:
    movu   m1, [srcq+wq]
    pshufb m1, m0
    movu   [dstq+wq], m1
    add    wq, mmsize
    jl .loop_simd

.end:
    RET
%endmacro

INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
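
; Adding another byte order only needs a matching shuffle constant in
; SECTION_RODATA plus one instantiation.  A minimal sketch (pb_shuffle1032
; is hypothetical, not part of this file):
;     pb_shuffle1032: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
;     SHUFFLE_BYTES 1, 0, 3, 2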

;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;              const uint8_t *src, int width, int height,
;              int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
    pxor    m0, m0
    pcmpeqw m1, m1
    psrlw   m1, 8

    movsxdifnidn            wq, wd
    movsxdifnidn   lum_strideq, lum_strided
    movsxdifnidn chrom_strideq, chrom_strided
    movsxdifnidn   src_strideq, src_strided

    mov back_wq, wq
    mov  whalfq, wq
    shr  whalfq, 1     ; whalf = width / 2

    lea srcq, [srcq + wq * 2]
    add ydstq, wq
    add udstq, whalfq
    add vdstq, whalfq

.loop_line:
    mov    xq, wq
    mov wtwoq, wq
    add wtwoq, wtwoq   ; wtwo = width * 2

    neg     wq
    neg  wtwoq
    neg whalfq

    ; calc scalar loop count
    and xq, mmsize * 2 - 1
    je .loop_simd

    .loop_scalar:
        mov tmpb, [srcq + wtwoq + 0]
        mov [udstq + whalfq], tmpb

        mov tmpb, [srcq + wtwoq + 1]
        mov [ydstq + wq], tmpb

        mov tmpb, [srcq + wtwoq + 2]
        mov [vdstq + whalfq], tmpb

        mov tmpb, [srcq + wtwoq + 3]
        mov [ydstq + wq + 1], tmpb

        add     wq, 2
        add  wtwoq, 4
        add whalfq, 1
        sub     xq, 2
        jg .loop_scalar

    ; check if the simd loop is needed
    cmp wq, 0
    jge .end_line

    .loop_simd:
        movu m2, [srcq + wtwoq             ]
        movu m3, [srcq + wtwoq + mmsize    ]
        movu m4, [srcq + wtwoq + mmsize * 2]
        movu m5, [srcq + wtwoq + mmsize * 3]

        ; extract y part 1
        RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m6, m1    ; YxYx YxYx...

        RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m7, m1    ; YxYx YxYx...

        packuswb    m6, m7    ; YYYY YYYY...
        movu [ydstq + wq], m6

        ; extract y part 2
        RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m6, m1    ; YxYx YxYx...

        RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
        pand        m7, m1    ; YxYx YxYx...

        packuswb    m6, m7    ; YYYY YYYY...
        movu [ydstq + wq + mmsize], m6

        ; extract uv
        pand m2, m1   ; UxVx...
        pand m3, m1   ; UxVx...
        pand m4, m1   ; UxVx...
        pand m5, m1   ; UxVx...

        packuswb m2, m3   ; UVUV...
        packuswb m4, m5   ; UVUV...

        ; U
        pand m6, m2, m1   ; UxUx...
        pand m7, m4, m1   ; UxUx...

        packuswb m6, m7   ; UUUU
        movu [udstq + whalfq], m6

        ; V
        psrlw m2, 8       ; VxVx...
        psrlw m4, 8       ; VxVx...
        packuswb m2, m4   ; VVVV
        movu [vdstq + whalfq], m2

        add whalfq, mmsize
        add  wtwoq, mmsize * 4
        add     wq, mmsize * 2
        jl .loop_simd

    .end_line:
        add srcq,  src_strideq
        add ydstq, lum_strideq
        add udstq, chrom_strideq
        add vdstq, chrom_strideq

        ; restore the per-line variables
        mov wq, back_wq
        mov xq, wq
        mov whalfq, wq
        shr whalfq, 1 ; whalf = width / 2
        sub hd, 1
        jg .loop_line

    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422

INIT_XMM avx
UYVY_TO_YUV422
%endif
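
; cglobal emits one symbol per instantiation above: ff_uyvytoyuv422_sse2 and
; ff_uyvytoyuv422_avx.  A minimal sketch of the matching C-side declaration
; and runtime dispatch (the surrounding C code is assumed, not part of this
; file):
;     void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;                               const uint8_t *src, int width, int height,
;                               int lumStride, int chromStride, int srcStride);
;     if (EXTERNAL_SSE2(cpu_flags))
;         uyvytoyuv422 = ff_uyvytoyuv422_sse2;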