; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"

; SYMBOL is not defined in this file; presumably the including file defines it
; as the name of the function to emit (TODO confirm). EXPORT and
; function_align are likewise expected to come from the included macro files.
  EXPORT    SYMBOL
  align     function_align

; Non-PIC code is the fastest so use this if possible.
%ifndef PIC
;-----------------------------------------------------------------------------
; mangle(SYMBOL)(Y, U, V, A, ARGB, WIDTH)
;
; Converts one row of pixels. Each main-loop iteration consumes two bytes from
; Y, one byte from U, one byte from V and two bytes from A, and stores 8 bytes
; (two pixels) at ARGB. A trailing odd pixel is handled separately and stores
; 4 bytes.
;
; Lookup table kCoefficientsRgbY, as indexed by this code (8-byte entries,
; each selected by a zero-extended byte, so every sub-table spans 256 * 8 =
; 2048 bytes):
;   +0     indexed by a Y byte
;   +2048  indexed by a U byte
;   +4096  indexed by a V byte
;   +6144  indexed by an A byte (alpha multiplier)
;
; Registers: TEMP is scratch for the table index; mm0-mm2 are used.
; NOTE(review): PROLOGUE/RET are macros from the included files; the
; "6, 7, 3" arguments presumably mean 6 arguments, 7 general-purpose registers
; and 3 vector registers (x86inc convention) - confirm.
; NOTE(review): no EMMS is issued in this routine; presumably the caller is
; responsible for clearing MMX state - confirm.
;-----------------------------------------------------------------------------
mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
  extern    mangle(kCoefficientsRgbY)
  ; Enter at the loop test so that a width below 2 skips the two-pixel body.
  jmp       .convertend

.convertloop:
  ; mm0 = table[U] + table[V] (saturating signed word add); this chroma term
  ; is shared by both pixels of the pair.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
  add       Uq, 1
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
  add       Vq, 1

  ; mm1 / mm2 = table entries for the first / second Y byte.
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq + 1]
  movq      mm2, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
  add       Yq, 2

  ; Add the chroma term to each luma term, shift each signed word right by 6
  ; and pack with unsigned saturation: low 4 bytes of mm1 = first pixel,
  ; high 4 bytes = second pixel.
  paddsw    mm1, mm0
  paddsw    mm2, mm0
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2

  ; Multiply ARGB by alpha value.
  ; Widen each pixel's bytes to words (mm0 = first pixel, mm1 = second), then
  ; multiply by the table entry selected by that pixel's A byte and keep the
  ; high byte of each 16-bit product (psrlw 8).
  movq      mm0, mm1
  pxor      mm2, mm2
  punpcklbw mm0, mm2
  punpckhbw mm1, mm2
  movzx     TEMPd, BYTE [Aq]
  movq      mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
  pmullw    mm0, mm2
  psrlw     mm0, 8
  movzx     TEMPd, BYTE [Aq + 1]
  movq      mm2, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
  add       Aq, 2
  pmullw    mm1, mm2
  psrlw     mm1, 8
  packuswb  mm0, mm1

  ; Store two pixels (8 bytes). MOVQ (upper case) is not defined in this
  ; file; presumably a macro from the including file that selects the store
  ; instruction - confirm.
  MOVQ      [ARGBq], mm0
  add       ARGBq, 8

.convertend:
  ; Loop while at least two pixels remain (result of the subtract is >= 0).
  sub       WIDTHq, 2
  jns       .convertloop

  ; If number of pixels is odd then compute it.
  ; For a non-negative width, WIDTH is now -1 (odd) or -2 (even), so bit 0
  ; tells whether one pixel is left over.
  and       WIDTHq, 1
  jz        .convertdone

  ; Same computation as the loop body, for a single pixel.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [mangle(kCoefficientsRgbY) + 2048 + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [mangle(kCoefficientsRgbY) + 4096 + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [mangle(kCoefficientsRgbY) + 8 * TEMPq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1

  ; Multiply ARGB by alpha value.
  pxor      mm0, mm0
  punpcklbw mm1, mm0
  movzx     TEMPd, BYTE [Aq]
  movq      mm0, [mangle(kCoefficientsRgbY) + 6144 + 8 * TEMPq]
  pmullw    mm1, mm0
  psrlw     mm1, 8
  packuswb  mm1, mm1

  ; Store the single remaining pixel (4 bytes).
  movd      [ARGBq], mm1

.convertdone:
  RET
%endif

; With PIC code we need to load the address of mangle(kCoefficientsRgbY).
; This code is slower than the above version.
%ifdef PIC
;-----------------------------------------------------------------------------
; mangle(SYMBOL)(Y, U, V, A, ARGB, WIDTH)  -  PIC variant
;
; Performs the same per-pixel computation as the non-PIC variant, but indexes
; the lookup table through a base register (TABLE) instead of an absolute
; address. To free a register for TABLE, WIDTH is pushed on the stack and the
; remaining-pixel counter lives at [rsp] for the rest of the routine.
;
; Table layout as indexed here (8-byte entries, one per byte value):
;   TABLE + 0     indexed by a Y byte
;   TABLE + 2048  indexed by a U byte
;   TABLE + 4096  indexed by a V byte
;   TABLE + 6144  indexed by an A byte (alpha multiplier)
;
; NOTE(review): PROLOGUE, DEFINE_ARGS, LOAD_SYM, PUSH, POP and RET are macros
; from the included files; their exact expansion is not visible here.
; NOTE(review): the counter is updated as a dword at [rsp] while the value was
; pushed from WIDTHq; only the low 32 bits of the width take part in the loop.
; NOTE(review): no EMMS is issued in this routine; presumably the caller is
; responsible for clearing MMX state - confirm.
;-----------------------------------------------------------------------------
mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  6, 7, 3, Y, U, V, A, ARGB, WIDTH, TEMP
  extern    mangle(kCoefficientsRgbY)
  ; Spill the width to the stack, then rename the sixth argument register
  ; to TABLE and point it at the coefficient table.
  PUSH      WIDTHq
  DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP
  LOAD_SYM  TABLEq, mangle(kCoefficientsRgbY)
  ; Enter at the loop test so that a width below 2 skips the two-pixel body.
  jmp       .convertend

.convertloop:
  ; mm0 = table[U] + table[V]; shared by both pixels of the pair.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + 2048 + 8 * TEMPq]
  add       Uq, 1

  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq]
  add       Vq, 1

  ; mm1 / mm2 = table entries for the first / second Y byte.
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]

  movzx     TEMPd, BYTE [Yq + 1]
  movq      mm2, [TABLEq + 8 * TEMPq]
  add       Yq, 2

  ; Add UV components to Y component.
  paddsw    mm1, mm0
  paddsw    mm2, mm0

  ; Down shift and then pack.
  ; After packing, the low 4 bytes of mm1 hold the first pixel and the high
  ; 4 bytes hold the second pixel.
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2

  ; Unpack and multiply by alpha value, then repack high bytes of words.
  ; mm0 = first pixel widened to words, mm1 = second pixel; each is multiplied
  ; by the table entry selected by its own A byte.
  movq      mm0, mm1
  pxor      mm2, mm2
  punpcklbw mm0, mm2
  punpckhbw mm1, mm2
  movzx     TEMPd, BYTE [Aq]
  movq      mm2, [TABLEq + 6144 + 8 * TEMPq]
  pmullw    mm0, mm2
  psrlw     mm0, 8
  movzx     TEMPd, BYTE [Aq + 1]
  movq      mm2, [TABLEq + 6144 + 8 * TEMPq]
  add       Aq, 2
  pmullw    mm1, mm2
  psrlw     mm1, 8
  packuswb  mm0, mm1

  ; Store two pixels (8 bytes). MOVQ (upper case) is not defined in this
  ; file; presumably a macro from the including file that selects the store
  ; instruction - confirm.
  MOVQ      [ARGBq], mm0
  add       ARGBq, 8

.convertend:
  ; The remaining-pixel counter is the value pushed above; loop while at
  ; least two pixels remain (result of the subtract is >= 0).
  sub       dword [rsp], 2
  jns       .convertloop

  ; If number of pixels is odd then compute it.
  ; For a non-negative width the counter is now -1 (odd) or -2 (even), so
  ; bit 0 tells whether one pixel is left over.
  and       dword [rsp], 1
  jz        .convertdone

  ; Same computation as the loop body, for a single pixel.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + 2048 + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + 4096 + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1

  ; Multiply ARGB by alpha value.
  pxor      mm0, mm0
  punpcklbw mm1, mm0
  movzx     TEMPd, BYTE [Aq]
  movq      mm0, [TABLEq + 6144 + 8 * TEMPq]
  pmullw    mm1, mm0
  psrlw     mm1, 8
  packuswb  mm1, mm1

  ; Store the single remaining pixel (4 bytes).
  movd      [ARGBq], mm1

.convertdone:
  ; Balance the PUSH of the width; the popped value itself is not used.
  POP       TABLEq
  RET
%endif