; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"

; Syntax: NASM/yasm, Intel operand order.  Registers are accessed through the
; names created by PROLOGUE / DEFINE_ARGS (Yq, TEMPd, ...), so the same text
; is used for every target those macros support.

; -----------------------------------------------------------------------------
; SYMBOL(Y, U, V, A, ARGB, WIDTH)
;
; Converts one row of Y, U, V and alpha samples into packed 4-byte pixels whose
; channels have been multiplied by alpha, using MMX registers mm0-mm2 and the
; external lookup table kCoefficientsRgbY.
;
; In:    Y     - pointer to luma bytes, 2 consumed per loop iteration
;        U, V  - pointers to chroma bytes, 1 of each consumed per iteration
;                (both pixels of a pair share the same U/V sample)
;        A     - pointer to alpha bytes, 2 consumed per iteration
;        ARGB  - destination; 8 bytes are stored per iteration
;        WIDTH - number of pixels to convert
; Out:   nothing is returned in a register; the result is the ARGB row.
;
; Pixels are handled two at a time.  When WIDTH is odd, one extra pixel is
; converted after the loop and stored with a 4-byte movd.
;
; NOTE(review): WIDTH is assumed to be non-negative.  The non-PIC build tests
;   it at full register width while the PIC build tests only the low 32 bits
;   of the spilled value -- confirm the C prototype passes a pointer-sized
;   integer.
; NOTE(review): no emms is executed here; presumably the caller resets the
;   MMX/x87 state -- confirm.
; NOTE(review): the byte order of the four output channels is determined by
;   the layout of kCoefficientsRgbY, which is not visible in this file.
; -----------------------------------------------------------------------------

; Byte offsets of the U, V and alpha sub-tables inside kCoefficientsRgbY; the
; Y sub-table starts at offset 0.  Every lookup indexes a sub-table with one
; zero-extended 8-bit sample scaled by 8 and loads 8 bytes, and consecutive
; sub-tables are 2048 (= 256 * 8) bytes apart.
%define YUVA_U_OFFSET 2048
%define YUVA_V_OFFSET 4096
%define YUVA_A_OFFSET 6144

  EXPORT    SYMBOL
  align     function_align

mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
  extern    mangle(kCoefficientsRgbY)

%ifdef PIC
  ; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
  ; This code is slower than the non-PIC version.  The pixel count is spilled
  ; to the stack so that its register can be reused to hold the table address.
  PUSH      WIDTHq
  DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP
  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)
  %define   YUVA_COEFF_BASE  TABLEq
  %define   YUVA_WIDTH_LEFT  dword [rsp]
%else
  ; Non-PIC code is the fastest so use this if possible: the table is
  ; addressed directly by symbol and the pixel count stays in a register.
  %define   YUVA_COEFF_BASE  mangle(kCoefficientsRgbY)
  %define   YUVA_WIDTH_LEFT  WIDTHq
%endif

  ; Enter at the loop test so that a row shorter than two pixels never runs
  ; the two-pixel body.
  jmp       .convertend

.convertloop:
  ; mm0 = U contribution + V contribution (signed saturating word add).
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [YUVA_COEFF_BASE + YUVA_U_OFFSET + 8 * TEMPq]
  add       Uq, 1

  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [YUVA_COEFF_BASE + YUVA_V_OFFSET + 8 * TEMPq]
  add       Vq, 1

  ; mm1 / mm2 = Y contribution of the first / second pixel of the pair.
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [YUVA_COEFF_BASE + 8 * TEMPq]

  movzx     TEMPd, BYTE [Yq + 1]
  movq      mm2, [YUVA_COEFF_BASE + 8 * TEMPq]
  add       Yq, 2

  ; Add UV components to Y component.
  paddsw    mm1, mm0
  paddsw    mm2, mm0

  ; Down shift and then pack: after the pack, the low 4 bytes of mm1 hold
  ; the first pixel and the high 4 bytes hold the second one.
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2

  ; Unpack and multiply by alpha value, then repack high bytes of words.
  ; mm0 = first pixel widened to words, mm1 = second pixel widened to words.
  movq      mm0, mm1
  pxor      mm2, mm2
  punpcklbw mm0, mm2
  punpckhbw mm1, mm2

  ; First pixel: (channel * alpha table entry) >> 8.
  movzx     TEMPd, BYTE [Aq]
  movq      mm2, [YUVA_COEFF_BASE + YUVA_A_OFFSET + 8 * TEMPq]
  pmullw    mm0, mm2
  psrlw     mm0, 8

  ; Second pixel, same computation with the next alpha byte.
  movzx     TEMPd, BYTE [Aq + 1]
  movq      mm2, [YUVA_COEFF_BASE + YUVA_A_OFFSET + 8 * TEMPq]
  add       Aq, 2
  pmullw    mm1, mm2
  psrlw     mm1, 8
  packuswb  mm0, mm1

  ; MOVQ is deliberately left in upper case as in the original source; it is
  ; presumably a macro provided by the including file -- confirm.
  MOVQ      [ARGBq], mm0
  add       ARGBq, 8

.convertend:
  ; Keep looping while at least two pixels remained before the subtraction.
  sub       YUVA_WIDTH_LEFT, 2
  jns       .convertloop

  ; If number of pixels is odd then compute it.  The counter is now -2 or -1
  ; for a non-negative WIDTH, so its lowest bit tells whether one pixel is
  ; still pending.
  and       YUVA_WIDTH_LEFT, 1
  jz        .convertdone

  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [YUVA_COEFF_BASE + YUVA_U_OFFSET + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [YUVA_COEFF_BASE + YUVA_V_OFFSET + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [YUVA_COEFF_BASE + 8 * TEMPq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1

  ; Multiply ARGB by alpha value.
  pxor      mm0, mm0
  punpcklbw mm1, mm0
  movzx     TEMPd, BYTE [Aq]
  movq      mm0, [YUVA_COEFF_BASE + YUVA_A_OFFSET + 8 * TEMPq]
  pmullw    mm1, mm0
  psrlw     mm1, 8
  packuswb  mm1, mm1

  ; Only one pixel (4 bytes) is written for the trailing odd pixel.
  movd      [ARGBq], mm1

.convertdone:
%ifdef PIC
  ; Drop the spilled pixel counter; the popped value itself is not used.
  POP       TABLEq
%endif
  RET

; Do not leak the helper definitions into whatever includes this file.
%undef YUVA_COEFF_BASE
%undef YUVA_WIDTH_LEFT
%undef YUVA_U_OFFSET
%undef YUVA_V_OFFSET
%undef YUVA_A_OFFSET