1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "build/build_config.h"
6 #include "media/base/simd/convert_rgb_to_yuv.h"
7 #include "media/base/simd/yuv_to_rgb_table.h"
8
#include <string.h>

9 #if defined(COMPILER_MSVC)
10 #include <intrin.h>
11 #else
12 #include <mmintrin.h>
13 #include <emmintrin.h>
14 #endif
15
16 namespace media {
17
// Number of fractional bits in the fixed-point coefficients used by this file.
18 #define FIX_SHIFT 12
// Scales |x| by 2^FIX_SHIFT, i.e. expresses it in that fixed-point format.
19 #define FIX(x) ((x) * (1 << FIX_SHIFT))
20
21 // Converts a coefficient to fixed point and casts the result to int16 so it
// can be stored directly in the int16 coefficient table.
22 #define INT16_FIX(x) static_cast<int16>(FIX(x))
23
24 // Fixed-point RGB -> YUV coefficients. Android's pixel layout is RGBA, while
25 // other platforms are BGRA, so each variant lists the coefficients in the
// byte order of its pixels.
//
// Layout: entries [0, 8) are the Y coefficients, [8, 16) the U coefficients
// and [16, 24) the V coefficients. Within a row, the three channel
// coefficients are followed by a 0 for the fourth byte of the pixel, and the
// group is repeated twice so that one 128-bit load lines up with two pixels
// whose bytes have been widened to 16 bits.
26 #if defined(OS_ANDROID)
27 SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
// Y: R, G, B, unused (x2).
28 INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
29 INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
// U: R, G, B, unused (x2).
30 -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
31 -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
// V: R, G, B, unused (x2).
32 INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
33 INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
34 };
35 #else
36 SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
// Y: B, G, R, unused (x2).
37 INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
38 INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
// U: B, G, R, unused (x2).
39 INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
40 INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
// V: B, G, R, unused (x2).
41 -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
42 -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
43 };
44 #endif
45
// INT16_FIX is only needed to build the table above.
46 #undef INT16_FIX
47
48 // Offset added to the Y samples after the fixed-point down-shift, turning
49 // the signed intermediate values into unsigned ones. All four 32-bit lanes
50 // hold 16, one lane per pixel. The SSE2 row converter derives its U/V offset
// of 128 from this same vector by shifting every lane left by 3.
51 SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
52
// Limits |value| to the range of an unsigned 8-bit sample, [0, 255].
static inline int Clamp(int value) {
  const int floored = (value < 0) ? 0 : value;
  return (floored > 255) ? 255 : floored;
}
60
RGBToY(int r,int g,int b)61 static inline int RGBToY(int r, int g, int b) {
62 int y = ConvertRGBAToYUV_kTable[0] * b +
63 ConvertRGBAToYUV_kTable[1] * g +
64 ConvertRGBAToYUV_kTable[2] * r;
65 y >>= FIX_SHIFT;
66 return Clamp(y + 16);
67 }
68
RGBToU(int r,int g,int b,int shift)69 static inline int RGBToU(int r, int g, int b, int shift) {
70 int u = ConvertRGBAToYUV_kTable[8] * b +
71 ConvertRGBAToYUV_kTable[9] * g +
72 ConvertRGBAToYUV_kTable[10] * r;
73 u >>= FIX_SHIFT + shift;
74 return Clamp(u + 128);
75 }
76
RGBToV(int r,int g,int b,int shift)77 static inline int RGBToV(int r, int g, int b, int shift) {
78 int v = ConvertRGBAToYUV_kTable[16] * b +
79 ConvertRGBAToYUV_kTable[17] * g +
80 ConvertRGBAToYUV_kTable[18] * r;
81 v >>= FIX_SHIFT + shift;
82 return Clamp(v + 128);
83 }
84
// Converts one 32-bit pixel to a Y sample.
//
// Reads the first three bytes at |rgb_buf| into the caller's |b|, |g| and |r|
// variables (in that order), skips the fourth byte, adds the three values to
// the caller's |sum_b|, |sum_g| and |sum_r| accumulators and writes
// RGBToY(r, g, b) to |y_buf|. Both pointer arguments are advanced past the
// data that was consumed / produced, so they must be modifiable lvalues.
//
// The body is wrapped in do { } while (0) so that the macro expands to a
// single statement and stays correct inside an unbraced if/else.
#define CONVERT_Y(rgb_buf, y_buf) \
  do {                            \
    b = *(rgb_buf)++;             \
    g = *(rgb_buf)++;             \
    r = *(rgb_buf)++;             \
    ++(rgb_buf);                  \
    sum_b += b;                   \
    sum_g += g;                   \
    sum_r += r;                   \
    *(y_buf)++ = RGBToY(r, g, b); \
  } while (0)
94
ConvertRGBToYUV_V2H2(const uint8 * rgb_buf_1,const uint8 * rgb_buf_2,uint8 * y_buf_1,uint8 * y_buf_2,uint8 * u_buf,uint8 * v_buf)95 static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
96 const uint8* rgb_buf_2,
97 uint8* y_buf_1,
98 uint8* y_buf_2,
99 uint8* u_buf,
100 uint8* v_buf) {
101 int sum_b = 0;
102 int sum_g = 0;
103 int sum_r = 0;
104 int r, g, b;
105
106
107
108 CONVERT_Y(rgb_buf_1, y_buf_1);
109 CONVERT_Y(rgb_buf_1, y_buf_1);
110 CONVERT_Y(rgb_buf_2, y_buf_2);
111 CONVERT_Y(rgb_buf_2, y_buf_2);
112 *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
113 *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
114 }
115
ConvertRGBToYUV_V2H1(const uint8 * rgb_buf_1,const uint8 * rgb_buf_2,uint8 * y_buf_1,uint8 * y_buf_2,uint8 * u_buf,uint8 * v_buf)116 static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
117 const uint8* rgb_buf_2,
118 uint8* y_buf_1,
119 uint8* y_buf_2,
120 uint8* u_buf,
121 uint8* v_buf) {
122 int sum_b = 0;
123 int sum_g = 0;
124 int sum_r = 0;
125 int r, g, b;
126
127 CONVERT_Y(rgb_buf_1, y_buf_1);
128 CONVERT_Y(rgb_buf_2, y_buf_2);
129 *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
130 *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
131 }
132
ConvertRGBToYUV_V1H2(const uint8 * rgb_buf,uint8 * y_buf,uint8 * u_buf,uint8 * v_buf)133 static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
134 uint8* y_buf,
135 uint8* u_buf,
136 uint8* v_buf) {
137 int sum_b = 0;
138 int sum_g = 0;
139 int sum_r = 0;
140 int r, g, b;
141
142 CONVERT_Y(rgb_buf, y_buf);
143 CONVERT_Y(rgb_buf, y_buf);
144 *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
145 *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
146 }
147
ConvertRGBToYUV_V1H1(const uint8 * rgb_buf,uint8 * y_buf,uint8 * u_buf,uint8 * v_buf)148 static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
149 uint8* y_buf,
150 uint8* u_buf,
151 uint8* v_buf) {
152 int sum_b = 0;
153 int sum_g = 0;
154 int sum_r = 0;
155 int r, g, b;
156
157 CONVERT_Y(rgb_buf, y_buf);
158 *u_buf++ = RGBToU(r, g, b, 0);
159 *v_buf++ = RGBToV(r, g, b, 0);
160 }
161
ConvertRGB32ToYUVRow_SSE2(const uint8 * rgb_buf_1,const uint8 * rgb_buf_2,uint8 * y_buf_1,uint8 * y_buf_2,uint8 * u_buf,uint8 * v_buf,int width)162 static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
163 const uint8* rgb_buf_2,
164 uint8* y_buf_1,
165 uint8* y_buf_2,
166 uint8* u_buf,
167 uint8* v_buf,
168 int width) {
169 while (width >= 4) {
170 // Name for the Y pixels:
171 // Row 1: a b c d
172 // Row 2: e f g h
173 //
174 // First row 4 pixels.
175 __m128i rgb_row_1 = _mm_loadu_si128(
176 reinterpret_cast<const __m128i*>(rgb_buf_1));
177 __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);
178
179 __m128i y_table = _mm_load_si128(
180 reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
181
182 __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
183 rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);
184
185 __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
186 rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
187
188 // Do a crazh shuffle so that we get:
189 // v------------ Multiply Add
190 // BG: a b c d
191 // A0: a b c d
192 __m128i bg_abcd = _mm_castps_si128(
193 _mm_shuffle_ps(
194 _mm_castsi128_ps(rgb_c_d),
195 _mm_castsi128_ps(rgb_a_b),
196 (3 << 6) | (1 << 4) | (3 << 2) | 1));
197 __m128i r_abcd = _mm_castps_si128(
198 _mm_shuffle_ps(
199 _mm_castsi128_ps(rgb_c_d),
200 _mm_castsi128_ps(rgb_a_b),
201 (2 << 6) | (2 << 2)));
202 __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
203
204 // Down shift back to 8bits range.
205 __m128i y_offset = _mm_load_si128(
206 reinterpret_cast<const __m128i*>(kYOffset));
207 y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
208 y_abcd = _mm_add_epi32(y_abcd, y_offset);
209 y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
210 y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
211 *reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
212 y_buf_1 += 4;
213
214 // Second row 4 pixels.
215 __m128i rgb_row_2 = _mm_loadu_si128(
216 reinterpret_cast<const __m128i*>(rgb_buf_2));
217 __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
218 __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
219 __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
220
221 // Add two rows together.
222 __m128i rgb_ae_bf =
223 _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
224 __m128i rgb_cg_dh =
225 _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
226
227 // Multiply add like the previous row.
228 rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
229 rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
230
231 __m128i bg_efgh = _mm_castps_si128(
232 _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
233 _mm_castsi128_ps(rgb_e_f),
234 (3 << 6) | (1 << 4) | (3 << 2) | 1));
235 __m128i r_efgh = _mm_castps_si128(
236 _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
237 _mm_castsi128_ps(rgb_e_f),
238 (2 << 6) | (2 << 2)));
239 __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
240 y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
241 y_efgh = _mm_add_epi32(y_efgh, y_offset);
242 y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
243 y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
244 *reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
245 y_buf_2 += 4;
246
247 __m128i rgb_ae_cg = _mm_castps_si128(
248 _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
249 _mm_castsi128_ps(rgb_ae_bf),
250 (3 << 6) | (2 << 4) | (3 << 2) | 2));
251 __m128i rgb_bf_dh = _mm_castps_si128(
252 _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
253 _mm_castsi128_ps(rgb_ae_bf),
254 (1 << 6) | (1 << 2)));
255
256 // This is a 2x2 subsampling for 2 pixels.
257 __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
258
259 // Do a multiply add with U table.
260 __m128i u_a_b = _mm_madd_epi16(
261 rgb_abef_cdgh,
262 _mm_load_si128(
263 reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
264 u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
265 _mm_shuffle_epi32(u_a_b, (2 << 2)));
266 // Right shift 14 because of 12 from fixed point and 2 from subsampling.
267 u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
268 __m128i uv_offset = _mm_slli_epi32(y_offset, 3);
269 u_a_b = _mm_add_epi32(u_a_b, uv_offset);
270 u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
271 u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
272 *reinterpret_cast<uint16*>(u_buf) = _mm_extract_epi16(u_a_b, 0);
273 u_buf += 2;
274
275 __m128i v_a_b = _mm_madd_epi16(
276 rgb_abef_cdgh,
277 _mm_load_si128(
278 reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
279 v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
280 _mm_shuffle_epi32(v_a_b, (2 << 2)));
281 v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
282 v_a_b = _mm_add_epi32(v_a_b, uv_offset);
283 v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
284 v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
285 *reinterpret_cast<uint16*>(v_buf) = _mm_extract_epi16(v_a_b, 0);
286 v_buf += 2;
287
288 rgb_buf_1 += 16;
289 rgb_buf_2 += 16;
290
291 // Move forward by 4 pixels.
292 width -= 4;
293 }
294
295 // Just use C code to convert the remaining pixels.
296 if (width >= 2) {
297 ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
298 rgb_buf_1 += 8;
299 rgb_buf_2 += 8;
300 y_buf_1 += 2;
301 y_buf_2 += 2;
302 ++u_buf;
303 ++v_buf;
304 width -= 2;
305 }
306
307 if (width)
308 ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
309 }
310
ConvertRGB32ToYUV_SSE2(const uint8 * rgbframe,uint8 * yplane,uint8 * uplane,uint8 * vplane,int width,int height,int rgbstride,int ystride,int uvstride)311 extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
312 uint8* yplane,
313 uint8* uplane,
314 uint8* vplane,
315 int width,
316 int height,
317 int rgbstride,
318 int ystride,
319 int uvstride) {
320 while (height >= 2) {
321 ConvertRGB32ToYUVRow_SSE2(rgbframe,
322 rgbframe + rgbstride,
323 yplane,
324 yplane + ystride,
325 uplane,
326 vplane,
327 width);
328 rgbframe += 2 * rgbstride;
329 yplane += 2 * ystride;
330 uplane += uvstride;
331 vplane += uvstride;
332 height -= 2;
333 }
334
335 if (!height)
336 return;
337
338 // Handle the last row.
339 while (width >= 2) {
340 ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
341 rgbframe += 8;
342 yplane += 2;
343 ++uplane;
344 ++vplane;
345 width -= 2;
346 }
347
348 if (width)
349 ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
350 }
351
ConvertRGB32ToYUV_SSE2_Reference(const uint8 * rgbframe,uint8 * yplane,uint8 * uplane,uint8 * vplane,int width,int height,int rgbstride,int ystride,int uvstride)352 void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe,
353 uint8* yplane,
354 uint8* uplane,
355 uint8* vplane,
356 int width,
357 int height,
358 int rgbstride,
359 int ystride,
360 int uvstride) {
361 while (height >= 2) {
362 int i = 0;
363
364 // Convert a 2x2 block.
365 while (i + 2 <= width) {
366 ConvertRGBToYUV_V2H2(rgbframe + i * 4,
367 rgbframe + rgbstride + i * 4,
368 yplane + i,
369 yplane + ystride + i,
370 uplane + i / 2,
371 vplane + i / 2);
372 i += 2;
373 }
374
375 // Convert the last pixel of two rows.
376 if (i < width) {
377 ConvertRGBToYUV_V2H1(rgbframe + i * 4,
378 rgbframe + rgbstride + i * 4,
379 yplane + i,
380 yplane + ystride + i,
381 uplane + i / 2,
382 vplane + i / 2);
383 }
384
385 rgbframe += 2 * rgbstride;
386 yplane += 2 * ystride;
387 uplane += uvstride;
388 vplane += uvstride;
389 height -= 2;
390 }
391
392 if (!height)
393 return;
394
395 // Handle the last row.
396 while (width >= 2) {
397 ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
398 rgbframe += 8;
399 yplane += 2;
400 ++uplane;
401 ++vplane;
402 width -= 2;
403 }
404
405 // Handle the last pixel in the last row.
406 if (width)
407 ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
408 }
409
410 } // namespace media
411