; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"

  EXPORT    SYMBOL
  align     function_align

; Non-PIC variant. The coefficient table is addressed through its absolute
; symbol, so no register is spent on a table base. This is the fastest form
; and is used whenever PIC is not defined.
;
; Row conversion by table lookup. Every pass of the main loop consumes one U
; byte, one V byte and two Y bytes and stores 8 bytes at ARGB; a trailing odd
; pixel stores 4 bytes. Each sample byte indexes an 8-byte entry of
; kCoefficientsRgbY: Y entries start at offset 0, U entries at 2048 (256 * 8)
; and V entries at 4096 (2 * 256 * 8). The entries are added as saturating
; signed words, shifted right by 6 and packed to unsigned bytes.
;
; NOTE(review): SYMBOL and MOVQ are not defined in this file; presumably the
; file that %includes this one provides them -- confirm.
; NOTE(review): no emms is issued before RET here; presumably the caller is
; responsible for clearing MMX state -- confirm.
%ifndef PIC
mangle(SYMBOL):
  %assign   stack_offset 0
  ; Per x86inc conventions: 5 arguments, 7 general-purpose registers and
  ; 3 SIMD registers. IDXA and IDXB are scratch table indices.
  PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, IDXA, IDXB
  extern    mangle(kCoefficientsRgbY)
  jmp       .check_width                ; test WIDTH before the first pair

.pixel_pair:
  ; The instruction order below is interleaved on purpose; keep it as is.
  movzx     IDXAd, BYTE [Uq]            ; IDXA = U sample
  add       Uq, 1
  movzx     IDXBd, BYTE [Vq]            ; IDXB = V sample
  add       Vq, 1
  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * IDXAq]
  movzx     IDXAd, BYTE [Yq]            ; IDXA reused: first Y sample
  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * IDXBq]
  movzx     IDXBd, BYTE [Yq + 1]        ; IDXB reused: second Y sample
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * IDXAq]
  add       Yq, 2
  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * IDXBq]
  paddsw    mm1, mm0                    ; both pixels share the U+V term
  paddsw    mm2, mm0
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2                    ; low 4 bytes: pixel 0, high: pixel 1
  MOVQ      [ARGBq], mm1
  add       ARGBq, 8

.check_width:
  sub       WIDTHq, 2
  jns       .pixel_pair                 ; at least two pixels were left

  ; WIDTH is now -1 when a single pixel remains and -2 when none does, so
  ; bit 0 tells the two cases apart. WIDTH is not read again after this.
  test      WIDTHq, 1
  jz        .finished

  ; Last (odd) pixel: same lookups, but only 4 bytes are stored and the
  ; source/destination pointers are not advanced.
  movzx     IDXAd, BYTE [Uq]
  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * IDXAq]
  movzx     IDXBd, BYTE [Vq]
  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * IDXBq]
  movzx     IDXAd, BYTE [Yq]
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * IDXAq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1
  movd      [ARGBq], mm1

.finished:
  RET
%endif

; With PIC code we need to load the address of mangle(kCoefficientsRgbY)
; into a register first. This code is slower than the version above.
%ifdef PIC
; PIC variant of the row converter: the address of kCoefficientsRgbY is
; loaded into a register once and every lookup is made relative to it.
;
; Every pass of the main loop consumes one U byte, one V byte and two Y
; bytes and stores 8 bytes at ARGB; a trailing odd pixel stores 4 bytes.
; Each sample byte indexes an 8-byte table entry: Y entries start at offset
; 0, U entries at 2048 (256 * 8) and V entries at 4096 (2 * 256 * 8).
;
; NOTE(review): SYMBOL, MOVQ and LOAD_SYM are not defined in this file;
; presumably the including file or a shared macro file provides them --
; confirm.
; NOTE(review): no emms is issued before RET here; presumably the caller is
; responsible for clearing MMX state -- confirm.
mangle(SYMBOL):
  %assign   stack_offset 0
  ; Per x86inc conventions: 5 arguments, 7 general-purpose registers and
  ; 3 SIMD registers. IDX is a scratch table index, LUT the table base.
  PROLOGUE  5, 7, 3, Y, U, V, ARGB, WIDTH, IDX, LUT

  extern    mangle(kCoefficientsRgbY)
  LOAD_SYM  LUTq, mangle(kCoefficientsRgbY)

  jmp       .check_width                ; test WIDTH before the first pair

.pixel_pair:
  ; mm0 = U entry + V entry, shared by both pixels of the pair.
  movzx     IDXd, BYTE [Uq]
  movq      mm0, [LUTq + 2048 + 8 * IDXq]
  add       Uq, 1

  movzx     IDXd, BYTE [Vq]
  paddsw    mm0, [LUTq + 4096 + 8 * IDXq]
  add       Vq, 1

  ; mm1 / mm2 = Y entries of the first / second pixel.
  movzx     IDXd, BYTE [Yq]
  movq      mm1, [LUTq + 8 * IDXq]

  movzx     IDXd, BYTE [Yq + 1]
  movq      mm2, [LUTq + 8 * IDXq]
  add       Yq, 2

  ; First pixel: add the chroma term, then drop the 6 fraction bits.
  paddsw    mm1, mm0
  psraw     mm1, 6

  ; Second pixel: same treatment.
  paddsw    mm2, mm0
  psraw     mm2, 6

  ; Saturate to bytes and store both pixels with one 8-byte write.
  packuswb  mm1, mm2
  MOVQ      [ARGBq], mm1
  add       ARGBq, 8

.check_width:
  sub       WIDTHq, 2
  jns       .pixel_pair                 ; at least two pixels were left

  ; WIDTH is now -1 when a single pixel remains and -2 when none does, so
  ; bit 0 tells the two cases apart. WIDTH is not read again after this.
  test      WIDTHq, 1
  jz        .finished

  ; Last (odd) pixel: same lookups, but only 4 bytes are stored and the
  ; source/destination pointers are not advanced.
  movzx     IDXd, BYTE [Uq]
  movq      mm0, [LUTq + 2048 + 8 * IDXq]
  movzx     IDXd, BYTE [Vq]
  paddsw    mm0, [LUTq + 4096 + 8 * IDXq]
  movzx     IDXd, BYTE [Yq]
  movq      mm1, [LUTq + 8 * IDXq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1
  movd      [ARGBq], mm1

.finished:
  RET
%endif