• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **
3  ** Copyright 2009, The Android Open Source Project
4  **
5  ** Licensed under the Apache License, Version 2.0 (the "License");
6  ** you may not use this file except in compliance with the License.
7  ** You may obtain a copy of the License at
8  **
9  **     http://www.apache.org/licenses/LICENSE-2.0
10  **
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  */
17 
18 #include <emmintrin.h>
19 #include "SkBitmapProcState_opts_SSE2.h"
20 #include "SkUtils.h"
21 
S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)22 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
23                                    const uint32_t* xy,
24                                    int count, uint32_t* colors) {
25     SkASSERT(count > 0 && colors != NULL);
26     SkASSERT(s.fDoFilter);
27     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
28     SkASSERT(s.fAlphaScale == 256);
29 
30     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
31     unsigned rb = s.fBitmap->rowBytes();
32     uint32_t XY = *xy++;
33     unsigned y0 = XY >> 14;
34     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
35     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
36     unsigned subY = y0 & 0xF;
37 
38     // ( 0,  0,  0,  0,  0,  0,  0, 16)
39     __m128i sixteen = _mm_cvtsi32_si128(16);
40 
41     // ( 0,  0,  0,  0, 16, 16, 16, 16)
42     sixteen = _mm_shufflelo_epi16(sixteen, 0);
43 
44     // ( 0,  0,  0,  0,  0,  0,  0,  y)
45     __m128i allY = _mm_cvtsi32_si128(subY);
46 
47     // ( 0,  0,  0,  0,  y,  y,  y,  y)
48     allY = _mm_shufflelo_epi16(allY, 0);
49 
50     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
51     __m128i negY = _mm_sub_epi16(sixteen, allY);
52 
53     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
54     allY = _mm_unpacklo_epi64(allY, negY);
55 
56     // (16, 16, 16, 16, 16, 16, 16, 16 )
57     sixteen = _mm_shuffle_epi32(sixteen, 0);
58 
59     // ( 0,  0,  0,  0,  0,  0,  0,  0)
60     __m128i zero = _mm_setzero_si128();
61     do {
62         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
63         unsigned x0 = XX >> 18;
64         unsigned x1 = XX & 0x3FFF;
65 
66         // (0, 0, 0, 0, 0, 0, 0, x)
67         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
68 
69         // (0, 0, 0, 0, x, x, x, x)
70         allX = _mm_shufflelo_epi16(allX, 0);
71 
72         // (x, x, x, x, x, x, x, x)
73         allX = _mm_shuffle_epi32(allX, 0);
74 
75         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
76         __m128i negX = _mm_sub_epi16(sixteen, allX);
77 
78         // Load 4 samples (pixels).
79         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
80         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
81         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
82         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
83 
84         // (0, 0, a00, a10)
85         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
86 
87         // Expand to 16 bits per component.
88         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
89 
90         // ((a00 * (16-y)), (a10 * y)).
91         a00a10 = _mm_mullo_epi16(a00a10, allY);
92 
93         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
94         a00a10 = _mm_mullo_epi16(a00a10, negX);
95 
96         // (0, 0, a01, a10)
97         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
98 
99         // Expand to 16 bits per component.
100         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
101 
102         // (a01 * (16-y)), (a11 * y)
103         a01a11 = _mm_mullo_epi16(a01a11, allY);
104 
105         // (a01 * (16-y) * x), (a11 * y * x)
106         a01a11 = _mm_mullo_epi16(a01a11, allX);
107 
108         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
109         __m128i sum = _mm_add_epi16(a00a10, a01a11);
110 
111         // (DC, a00*w00 + a01*w01)
112         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
113 
114         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
115         sum = _mm_add_epi16(sum, shifted);
116 
117         // Divide each 16 bit component by 256.
118         sum = _mm_srli_epi16(sum, 8);
119 
120         // Pack lower 4 16 bit values of sum into lower 4 bytes.
121         sum = _mm_packus_epi16(sum, zero);
122 
123         // Extract low int and store.
124         *colors++ = _mm_cvtsi128_si32(sum);
125     } while (--count > 0);
126 }
127 
S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState & s,const uint32_t * xy,int count,uint32_t * colors)128 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
129                                   const uint32_t* xy,
130                                   int count, uint32_t* colors) {
131     SkASSERT(count > 0 && colors != NULL);
132     SkASSERT(s.fDoFilter);
133     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
134     SkASSERT(s.fAlphaScale < 256);
135 
136     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
137     unsigned rb = s.fBitmap->rowBytes();
138     uint32_t XY = *xy++;
139     unsigned y0 = XY >> 14;
140     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
141     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
142     unsigned subY = y0 & 0xF;
143 
144     // ( 0,  0,  0,  0,  0,  0,  0, 16)
145     __m128i sixteen = _mm_cvtsi32_si128(16);
146 
147     // ( 0,  0,  0,  0, 16, 16, 16, 16)
148     sixteen = _mm_shufflelo_epi16(sixteen, 0);
149 
150     // ( 0,  0,  0,  0,  0,  0,  0,  y)
151     __m128i allY = _mm_cvtsi32_si128(subY);
152 
153     // ( 0,  0,  0,  0,  y,  y,  y,  y)
154     allY = _mm_shufflelo_epi16(allY, 0);
155 
156     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
157     __m128i negY = _mm_sub_epi16(sixteen, allY);
158 
159     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
160     allY = _mm_unpacklo_epi64(allY, negY);
161 
162     // (16, 16, 16, 16, 16, 16, 16, 16 )
163     sixteen = _mm_shuffle_epi32(sixteen, 0);
164 
165     // ( 0,  0,  0,  0,  0,  0,  0,  0)
166     __m128i zero = _mm_setzero_si128();
167 
168     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
169     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
170 
171     do {
172         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
173         unsigned x0 = XX >> 18;
174         unsigned x1 = XX & 0x3FFF;
175 
176         // (0, 0, 0, 0, 0, 0, 0, x)
177         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
178 
179         // (0, 0, 0, 0, x, x, x, x)
180         allX = _mm_shufflelo_epi16(allX, 0);
181 
182         // (x, x, x, x, x, x, x, x)
183         allX = _mm_shuffle_epi32(allX, 0);
184 
185         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
186         __m128i negX = _mm_sub_epi16(sixteen, allX);
187 
188         // Load 4 samples (pixels).
189         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
190         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
191         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
192         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
193 
194         // (0, 0, a00, a10)
195         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
196 
197         // Expand to 16 bits per component.
198         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
199 
200         // ((a00 * (16-y)), (a10 * y)).
201         a00a10 = _mm_mullo_epi16(a00a10, allY);
202 
203         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
204         a00a10 = _mm_mullo_epi16(a00a10, negX);
205 
206         // (0, 0, a01, a10)
207         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
208 
209         // Expand to 16 bits per component.
210         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
211 
212         // (a01 * (16-y)), (a11 * y)
213         a01a11 = _mm_mullo_epi16(a01a11, allY);
214 
215         // (a01 * (16-y) * x), (a11 * y * x)
216         a01a11 = _mm_mullo_epi16(a01a11, allX);
217 
218         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
219         __m128i sum = _mm_add_epi16(a00a10, a01a11);
220 
221         // (DC, a00*w00 + a01*w01)
222         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
223 
224         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
225         sum = _mm_add_epi16(sum, shifted);
226 
227         // Divide each 16 bit component by 256.
228         sum = _mm_srli_epi16(sum, 8);
229 
230         // Multiply by alpha.
231         sum = _mm_mullo_epi16(sum, alpha);
232 
233         // Divide each 16 bit component by 256.
234         sum = _mm_srli_epi16(sum, 8);
235 
236         // Pack lower 4 16 bit values of sum into lower 4 bytes.
237         sum = _mm_packus_epi16(sum, zero);
238 
239         // Extract low int and store.
240         *colors++ = _mm_cvtsi128_si32(sum);
241     } while (--count > 0);
242 }
243