• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #include <assert.h>
14 #include <string.h>  // For memcpy and memset.
15 
16 #include "libyuv/basic_types.h"
17 #include "libyuv/convert_argb.h"  // For kYuvI601Constants
18 
19 #ifdef __cplusplus
20 namespace libyuv {
21 extern "C" {
22 #endif
23 
24 // This macro controls YUV to RGB using unsigned math to extend range of
25 // YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
26 // LIBYUV_UNLIMITED_DATA
27 
28 // Macros to enable unlimited data for each colorspace
29 // LIBYUV_UNLIMITED_BT601
30 // LIBYUV_UNLIMITED_BT709
31 // LIBYUV_UNLIMITED_BT2020
32 
33 // The following macro from row_win makes the C code match the row_win code,
34 // which is 7 bit fixed point for ARGBToI420:
35 #if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
36     defined(_MSC_VER) && !defined(__clang__) &&                   \
37     (defined(_M_IX86) || defined(_M_X64))
38 #define LIBYUV_RGB7 1
39 #endif
40 
41 #if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
42                                    defined(__i386__) || defined(_M_IX86))
43 #define LIBYUV_ARGBTOUV_PAVGB 1
44 #define LIBYUV_RGBTOU_TRUNCATE 1
45 #define LIBYUV_ATTENUATE_DUP 1
46 #endif
47 #if defined(LIBYUV_BIT_EXACT)
48 #define LIBYUV_UNATTENUATE_DUP 1
49 #endif
50 
51 // llvm x86 is poor at ternary operator, so use branchless min/max.
52 
53 #define USE_BRANCHLESS 1
54 #if USE_BRANCHLESS
// Branchless max(v, 0): negative inputs become 0, all others pass through.
static __inline int32_t clamp0(int32_t v) {
  // keep_mask is all ones when v >= 0 and zero when v is negative.
  const int32_t keep_mask = -(int32_t)(v >= 0);
  return keep_mask & v;
}
58 // TODO(fbarchard): make clamp255 preserve negative values.
// Branchless upper clamp to 255. Intended for non-negative inputs: a negative
// v is reduced to its low 8 bits instead of being preserved.
static __inline int32_t clamp255(int32_t v) {
  // saturate is all ones once v reaches 255, forcing the masked result to 255.
  const int32_t saturate = -(int32_t)(v >= 255);
  return (saturate | v) & 255;
}
62 
// Branchless upper clamp to 1023 (10 bit maximum). Intended for non-negative
// inputs: a negative v is reduced to its low 10 bits.
static __inline int32_t clamp1023(int32_t v) {
  // saturate is all ones once v reaches 1023.
  const int32_t saturate = -(int32_t)(v >= 1023);
  return (saturate | v) & 1023;
}
66 
// Branchless upper clamp of v to max.
// The final "& max" only leaves values below max untouched when max has the
// form 2^n - 1 (e.g. 255, 1023, 4095) and v is non-negative.
static __inline int32_t ClampMax(int32_t v, int32_t max) {
  // saturate is all ones once v reaches max.
  const int32_t saturate = -(int32_t)(v >= max);
  return (saturate | v) & max;
}
71 
// Branchless absolute value returned as unsigned.
// The arithmetic is done on uint32_t so that INT32_MIN does not overflow a
// signed int; Abs(INT32_MIN) is 2147483648.
static __inline uint32_t Abs(int32_t v) {
  // sign_mask is all ones for negative v and zero otherwise.
  const uint32_t sign_mask = 0u - (uint32_t)(v < 0);
  // Two's complement negate: (v - 1) with all bits flipped when v < 0.
  return ((uint32_t)v + sign_mask) ^ sign_mask;
}
76 #else   // USE_BRANCHLESS
// Returns v with negative values replaced by 0.
static __inline int32_t clamp0(int32_t v) {
  if (v < 0) {
    return 0;
  }
  return v;
}
80 
// Limits v to at most 255; values at or below 255 (including negative ones)
// are returned unchanged.
static __inline int32_t clamp255(int32_t v) {
  if (v > 255) {
    return 255;
  }
  return v;
}
84 
// Limits v to at most 1023; values at or below 1023 are returned unchanged.
static __inline int32_t clamp1023(int32_t v) {
  if (v > 1023) {
    return 1023;
  }
  return v;
}
88 
// Limits v to at most max; values at or below max are returned unchanged.
static __inline int32_t ClampMax(int32_t v, int32_t max) {
  if (v > max) {
    return max;
  }
  return v;
}
92 
// Absolute value returned as unsigned.
// The negation is done on uint32_t so that INT32_MIN does not overflow a
// signed int; Abs(INT32_MIN) is 2147483648.
static __inline uint32_t Abs(int32_t v) {
  const uint32_t bits = (uint32_t)v;
  return (v < 0) ? (0u - bits) : bits;
}
96 #endif  // USE_BRANCHLESS
// Clamps val for an 8 bit channel: the lower bound is applied with clamp0
// first, then the upper bound with clamp255.
static __inline uint32_t Clamp(int32_t val) {
  return (uint32_t)clamp255(clamp0(val));
}
101 
// Clamps val for a 10 bit channel: the lower bound is applied with clamp0
// first, then the upper bound with clamp1023.
static __inline uint32_t Clamp10(int32_t val) {
  return (uint32_t)clamp1023(clamp0(val));
}
106 
// WRITEWORD stores the 32 bit value v at p in little endian byte order.
// p does not need to be 4 byte aligned.
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
    defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) ||     \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
// Little endian host: the native representation already has the right byte
// order. memcpy avoids the alignment and aliasing problems of storing through
// a casted uint32_t pointer.
static __inline void WRITEWORD(uint8_t* p, uint32_t v) {
  memcpy(p, &v, sizeof v);
}
#else
// Other hosts: emit the bytes explicitly, least significant first.
static __inline void WRITEWORD(uint8_t* p, uint32_t v) {
  p[0] = (uint8_t)(v & 255);
  p[1] = (uint8_t)((v >> 8) & 255);
  p[2] = (uint8_t)((v >> 16) & 255);
  p[3] = (uint8_t)((v >> 24) & 255);
}
#endif
120 
// Expands width 3-byte RGB24 pixels into 4-byte ARGB pixels. The three colour
// bytes keep their order and a fourth byte of 255 (opaque alpha) is appended.
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t blue = src_rgb24[0];
    const uint8_t green = src_rgb24[1];
    const uint8_t red = src_rgb24[2];
    src_rgb24 += 3;
    dst_argb[0] = blue;
    dst_argb[1] = green;
    dst_argb[2] = red;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
135 
// Converts width 3-byte RAW pixels (R, G, B in memory) into 4-byte ARGB
// pixels (B, G, R, A in memory) with alpha set to 255.
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    src_raw += 3;
    dst_argb[0] = blue;
    dst_argb[1] = green;
    dst_argb[2] = red;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
150 
// Converts width 3-byte RAW pixels (R, G, B in memory) into 4-byte RGBA
// pixels written as A, B, G, R in memory, with alpha set to 255.
void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    src_raw += 3;
    dst_rgba[0] = 255u;
    dst_rgba[1] = blue;
    dst_rgba[2] = green;
    dst_rgba[3] = red;
    dst_rgba += 4;
  }
}
165 
// Swaps the first and third byte of each 3-byte pixel, turning RAW (R, G, B)
// into RGB24 (B, G, R). All three bytes are read before any is written.
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    src_raw += 3;
    dst_rgb24[0] = blue;
    dst_rgb24[1] = green;
    dst_rgb24[2] = red;
    dst_rgb24 += 3;
  }
}
179 
// Expands width RGB565 pixels (16 bits, low byte first) into ARGB.
// Each 5 or 6 bit channel is widened to 8 bits by replicating its top bits
// into the vacated low bits; alpha is set to 255.
void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
                       uint8_t* dst_argb,
                       int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Assemble the 16 bit pixel from its two bytes, then split the fields.
    const unsigned pixel = src_rgb565[0] | ((unsigned)src_rgb565[1] << 8);
    const unsigned blue5 = pixel & 0x1f;
    const unsigned green6 = (pixel >> 5) & 0x3f;
    const unsigned red5 = pixel >> 11;
    src_rgb565 += 2;
    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
    dst_argb[1] = (uint8_t)((green6 << 2) | (green6 >> 4));
    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
196 
// Expands width ARGB1555 pixels (16 bits, low byte first) into ARGB.
// The 5 bit colour channels are widened by replicating their top bits; the
// single alpha bit becomes 0 or 255.
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
                         uint8_t* dst_argb,
                         int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Assemble the 16 bit pixel from its two bytes, then split the fields.
    const unsigned pixel = src_argb1555[0] | ((unsigned)src_argb1555[1] << 8);
    const unsigned blue5 = pixel & 0x1f;
    const unsigned green5 = (pixel >> 5) & 0x1f;
    const unsigned red5 = (pixel >> 10) & 0x1f;
    const unsigned alpha1 = pixel >> 15;
    src_argb1555 += 2;
    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
    dst_argb[1] = (uint8_t)((green5 << 3) | (green5 >> 2));
    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
    dst_argb[3] = alpha1 ? 255u : 0u;
    dst_argb += 4;
  }
}
214 
// Expands width ARGB4444 pixels into ARGB by duplicating every 4 bit channel
// into both nibbles of the output byte.
void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
                         uint8_t* dst_argb,
                         int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const unsigned blue4 = src_argb4444[0] & 0x0f;
    const unsigned green4 = src_argb4444[0] >> 4;
    const unsigned red4 = src_argb4444[1] & 0x0f;
    const unsigned alpha4 = src_argb4444[1] >> 4;
    src_argb4444 += 2;
    // n * 0x11 places the nibble n in both halves of the byte.
    dst_argb[0] = (uint8_t)(blue4 * 0x11);
    dst_argb[1] = (uint8_t)(green4 * 0x11);
    dst_argb[2] = (uint8_t)(red4 * 0x11);
    dst_argb[3] = (uint8_t)(alpha4 * 0x11);
    dst_argb += 4;
  }
}
232 
// Converts width AR30 pixels (32 bit words holding 10:10:10 colour in bits
// 0-29 and 2 bit alpha in bits 30-31) to 8 bit ARGB words.
// Each colour channel keeps its top 8 bits; alpha is scaled from 2 to 8 bits.
// Both the load and the store go through memcpy so that neither buffer has to
// be 4 byte aligned.
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t argb;
    memcpy(&ar30, src_ar30, sizeof ar30);
    uint32_t b = (ar30 >> 2) & 0xff;
    uint32_t g = (ar30 >> 12) & 0xff;
    uint32_t r = (ar30 >> 22) & 0xff;
    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
    argb = b | (g << 8) | (r << 16) | (a << 24);
    memcpy(dst_argb, &argb, sizeof argb);
    dst_argb += 4;
    src_ar30 += 4;
  }
}
247 
// Converts width AR30 pixels (32 bit words holding 10:10:10 colour in bits
// 0-29 and 2 bit alpha in bits 30-31) to 8 bit ABGR words, i.e. with the
// channel from bits 20-29 in the low byte and the one from bits 0-9 in
// byte 2.
// Both the load and the store go through memcpy so that neither buffer has to
// be 4 byte aligned.
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t abgr;
    memcpy(&ar30, src_ar30, sizeof ar30);
    uint32_t b = (ar30 >> 2) & 0xff;
    uint32_t g = (ar30 >> 12) & 0xff;
    uint32_t r = (ar30 >> 22) & 0xff;
    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
    abgr = r | (g << 8) | (b << 16) | (a << 24);
    memcpy(dst_abgr, &abgr, sizeof abgr);
    dst_abgr += 4;
    src_ar30 += 4;
  }
}
262 
// Converts width AR30 pixels to AB30 by exchanging the 10 bit fields in bits
// 0-9 and bits 20-29; the middle field and the 2 bit alpha stay in place.
// Both the load and the store go through memcpy so that neither buffer has to
// be 4 byte aligned.
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t ab30;
    memcpy(&ar30, src_ar30, sizeof ar30);
    uint32_t b = ar30 & 0x3ff;
    uint32_t ga = ar30 & 0xc00ffc00;  // Green and alpha are not moved.
    uint32_t r = (ar30 >> 20) & 0x3ff;
    ab30 = r | ga | (b << 20);
    memcpy(dst_ab30, &ab30, sizeof ab30);
    dst_ab30 += 4;
    src_ar30 += 4;
  }
}
276 
// Packs width 4-byte ARGB pixels into 3-byte RGB24 pixels by dropping the
// fourth (alpha) byte; the colour bytes keep their order.
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    src_argb += 4;
    dst_rgb[0] = blue;
    dst_rgb[1] = green;
    dst_rgb[2] = red;
    dst_rgb += 3;
  }
}
290 
// Packs width 4-byte ARGB pixels (B, G, R, A in memory) into 3-byte RAW
// pixels (R, G, B in memory), dropping alpha.
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    src_argb += 4;
    dst_rgb[0] = red;
    dst_rgb[1] = green;
    dst_rgb[2] = blue;
    dst_rgb += 3;
  }
}
304 
// Packs width ARGB pixels into RGB565, keeping the top 5/6/5 bits of B/G/R.
// Every 16 bit pixel is written low byte first, so the output layout is the
// same on any host and for both even and odd widths, and dst_rgb does not
// need to be aligned.
// The pixel is built in unsigned arithmetic so that no shift can overflow a
// signed int.
void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b = (uint32_t)(src_argb[0] >> 3);
    const uint32_t g = (uint32_t)(src_argb[1] >> 2);
    const uint32_t r = (uint32_t)(src_argb[2] >> 3);
    const uint32_t rgb565 = b | (g << 5) | (r << 11);
    dst_rgb[0] = (uint8_t)(rgb565 & 255);
    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
326 
// dither4 is a row of 4 values from a 4x4 dither matrix.
// The 4x4 matrix contains values to increase RGB.  When converting to
// fewer bits (565) this provides an ordered dither.
// The order in the 4x4 matrix in first byte is upper left.
// The 4 values are passed as an int, then referenced as an array, so
// endian will not affect order of the original matrix.  But the dither4
// will contain the first pixel in the lower byte for little endian
// or the upper byte for big endian.
//
// Each pixel has the dither value for its column (x & 3) added to B, G and R,
// is clamped with clamp255 and then truncated to 5/6/5 bits. Pixels are
// stored as native-endian 16 bit values through memcpy, so dst_rgb does not
// need to be 2 byte aligned.
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
                             uint8_t* dst_rgb,
                             const uint32_t dither4,
                             int width) {
  const unsigned char* dither = (const unsigned char*)(&dither4);
  int x;
  for (x = 0; x < width - 1; x += 2) {
    int dither0 = dither[x & 3];
    int dither1 = dither[(x + 1) & 3];
    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
    uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
    uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
    uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
    uint16_t rgb0 = (uint16_t)(b0 | (g0 << 5) | (r0 << 11));
    uint16_t rgb1 = (uint16_t)(b1 | (g1 << 5) | (r1 << 11));
    memcpy(dst_rgb + 0, &rgb0, sizeof rgb0);
    memcpy(dst_rgb + 2, &rgb1, sizeof rgb1);
    dst_rgb += 4;
    src_argb += 8;
  }
  if (width & 1) {
    // Odd width: the last pixel sits in column width - 1.
    int dither0 = dither[(width - 1) & 3];
    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
    uint16_t rgb0 = (uint16_t)(b0 | (g0 << 5) | (r0 << 11));
    memcpy(dst_rgb, &rgb0, sizeof rgb0);
  }
}
362 
// Packs width ARGB pixels into ARGB1555: the top 5 bits of B, G and R and
// the top bit of A. Each pixel is stored as a native-endian 16 bit value
// through memcpy, so dst_rgb does not need to be 2 byte aligned.
void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const unsigned b = src_argb[0] >> 3;
    const unsigned g = src_argb[1] >> 3;
    const unsigned r = src_argb[2] >> 3;
    const unsigned a = src_argb[3] >> 7;
    const uint16_t argb1555 = (uint16_t)(b | (g << 5) | (r << 10) | (a << 15));
    memcpy(dst_rgb, &argb1555, sizeof argb1555);
    dst_rgb += 2;
    src_argb += 4;
  }
}
387 
// Packs width ARGB pixels into ARGB4444, keeping the top 4 bits of every
// channel. Each pixel is stored as a native-endian 16 bit value through
// memcpy, so dst_rgb does not need to be 2 byte aligned.
void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const unsigned b = src_argb[0] >> 4;
    const unsigned g = src_argb[1] >> 4;
    const unsigned r = src_argb[2] >> 4;
    const unsigned a = src_argb[3] >> 4;
    const uint16_t argb4444 = (uint16_t)(b | (g << 4) | (r << 8) | (a << 12));
    memcpy(dst_rgb, &argb4444, sizeof argb4444);
    dst_rgb += 2;
    src_argb += 4;
  }
}
412 
// Converts width 8 bit ABGR pixels to AR30 words.
// Source byte 0 lands in bits 20-29, byte 1 in bits 10-19, byte 2 in bits
// 0-9 and the top 2 bits of byte 3 (alpha) in bits 30-31.
// Colour channels are widened from 8 to 10 bits by replicating their top 2
// bits into the new low bits. The result is stored through memcpy so that
// dst_ar30 does not need to be 4 byte aligned.
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // Named by destination field: hi10 goes to bits 20-29, lo10 to bits 0-9.
    uint32_t hi10 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
    uint32_t mid10 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
    uint32_t lo10 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
    uint32_t a2 = (src_abgr[3] >> 6);
    uint32_t ar30 = lo10 | (mid10 << 10) | (hi10 << 20) | (a2 << 30);
    memcpy(dst_ar30, &ar30, sizeof ar30);
    dst_ar30 += 4;
    src_abgr += 4;
  }
}
425 
// Converts width 8 bit ARGB pixels to AR30 words.
// Source byte 0 (B) lands in bits 0-9, byte 1 (G) in bits 10-19, byte 2 (R)
// in bits 20-29 and the top 2 bits of byte 3 (A) in bits 30-31.
// Colour channels are widened from 8 to 10 bits by replicating their top 2
// bits into the new low bits. The result is stored through memcpy so that
// dst_ar30 does not need to be 4 byte aligned.
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
    uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
    uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
    uint32_t a0 = (src_argb[3] >> 6);
    uint32_t ar30 = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
    memcpy(dst_ar30, &ar30, sizeof ar30);
    dst_ar30 += 4;
    src_argb += 4;
  }
}
438 
// Widens width 8 bit ARGB pixels to 16 bits per channel. Every byte v
// becomes v * 0x0101, i.e. the byte repeated in both halves of the word.
void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_ar64[0] = (uint16_t)(src_argb[0] * 0x0101);
    dst_ar64[1] = (uint16_t)(src_argb[1] * 0x0101);
    dst_ar64[2] = (uint16_t)(src_argb[2] * 0x0101);
    dst_ar64[3] = (uint16_t)(src_argb[3] * 0x0101);
    src_argb += 4;
    dst_ar64 += 4;
  }
}
450 
// Widens width 8 bit ARGB pixels to 16 bits per channel while exchanging
// channels 0 and 2. Every byte v becomes v * 0x0101.
void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_ab64[0] = (uint16_t)(src_argb[2] * 0x0101);
    dst_ab64[1] = (uint16_t)(src_argb[1] * 0x0101);
    dst_ab64[2] = (uint16_t)(src_argb[0] * 0x0101);
    dst_ab64[3] = (uint16_t)(src_argb[3] * 0x0101);
    src_argb += 4;
    dst_ab64 += 4;
  }
}
462 
// Narrows width 16 bit per channel AR64 pixels to 8 bit ARGB by keeping the
// high byte of each channel.
void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_argb[0] = (uint8_t)(src_ar64[0] >> 8);
    dst_argb[1] = (uint8_t)(src_ar64[1] >> 8);
    dst_argb[2] = (uint8_t)(src_ar64[2] >> 8);
    dst_argb[3] = (uint8_t)(src_ar64[3] >> 8);
    src_ar64 += 4;
    dst_argb += 4;
  }
}
474 
// Narrows width 16 bit per channel AB64 pixels to 8 bit ARGB by keeping the
// high byte of each channel and exchanging channels 0 and 2.
void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_argb[0] = (uint8_t)(src_ab64[2] >> 8);
    dst_argb[1] = (uint8_t)(src_ab64[1] >> 8);
    dst_argb[2] = (uint8_t)(src_ab64[0] >> 8);
    dst_argb[3] = (uint8_t)(src_ab64[3] >> 8);
    src_ab64 += 4;
    dst_argb += 4;
  }
}
486 
487 // TODO(fbarchard): Make shuffle compatible with SIMD versions
// Reorders the four 16 bit channels of each pixel according to shuffler.
// Only shuffler[0], [2], [4] and [6] are read; each is a byte offset within
// the 8 byte pixel and is halved to get a 16 bit channel index.
// Note that width / 2 pixels are processed, not width.
void AR64ShuffleRow_C(const uint8_t* src_ar64,
                      uint8_t* dst_ar64,
                      const uint8_t* shuffler,
                      int width) {
  const uint16_t* in = (const uint16_t*)src_ar64;
  uint16_t* out = (uint16_t*)dst_ar64;
  const int lane0 = shuffler[0] >> 1;
  const int lane1 = shuffler[2] >> 1;
  const int lane2 = shuffler[4] >> 1;
  const int lane3 = shuffler[6] >> 1;
  int remaining;
  for (remaining = width / 2; remaining > 0; --remaining) {
    // Read all four channels before writing so in-place conversion works.
    const uint16_t c0 = in[lane0];
    const uint16_t c1 = in[lane1];
    const uint16_t c2 = in[lane2];
    const uint16_t c3 = in[lane3];
    in += 4;
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
    out[3] = c3;
    out += 4;
  }
}
514 
515 #ifdef LIBYUV_RGB7
516 // Old 7 bit math for compatibility on unsupported platforms.
// 7 bit fixed point luma: weights 33/65/13 out of 128, then the +16 offset.
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 33 * r + 65 * g + 13 * b;
  return (weighted >> 7) + 16;
}
520 #else
521 // 8 bit
522 // Intel SSE/AVX uses the following equivalent formula
523 // 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round.
524 //  return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
525 //  0x7e80) >> 8;
526 
// 8 bit fixed point luma: weights 66/129/25 out of 256.
// The constant 0x1080 is 0x1000 (+16 after the shift) plus 0x80 for rounding.
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 66 * r + 129 * g + 25 * b;
  return (weighted + 0x1080) >> 8;
}
530 #endif
531 
// Average of a and b with halves rounded up: (a + b + 1) / 2.
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
533 
534 // LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round.
535 #ifdef LIBYUV_RGBTOU_TRUNCATE
// U with weights 112/-74/-38 out of 256, biased by 0x8000 (+128 after the
// shift). No rounding term is added, so the result is truncated.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * b - 74 * g - 38 * r;
  return (weighted + 0x8000) >> 8;
}
// V with weights 112/-94/-18 out of 256, biased by 0x8000 (+128 after the
// shift). No rounding term is added, so the result is truncated.
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return (weighted + 0x8000) >> 8;
}
542 #else
543 // TODO(fbarchard): Add rounding to x86 SIMD and use this
// U with weights 112/-74/-38 out of 256. The constant 0x8080 is 0x8000
// (+128 after the shift) plus 0x80 for rounding.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * b - 74 * g - 38 * r;
  return (weighted + 0x8080) >> 8;
}
// V with weights 112/-94/-18 out of 256. The constant 0x8080 is 0x8000
// (+128 after the shift) plus 0x80 for rounding.
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return (weighted + 0x8080) >> 8;
}
550 #endif
551 
552 // LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb.
553 #if !defined(LIBYUV_ARGBTOUV_PAVGB)
// U from channel values that are twice the usual scale (e.g. sums of two
// samples): the 112/-74/-38 weights are halved to compensate.
// 0x8080 is the +128 bias plus rounding.
static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
  const int weighted = (112 / 2) * b - (74 / 2) * g - (38 / 2) * r;
  return (weighted + 0x8080) >> 8;
}
// V from channel values that are twice the usual scale (e.g. sums of two
// samples): the 112/-94/-18 weights are halved to compensate.
// 0x8080 is the +128 bias plus rounding.
static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
  const int weighted = (112 / 2) * r - (94 / 2) * g - (18 / 2) * b;
  return (weighted + 0x8080) >> 8;
}
560 #endif
561 
562 // ARGBToY_C and ARGBToUV_C
563 // Intel version mimic SSE/AVX which does 2 pavgb
564 #if LIBYUV_ARGBTOUV_PAVGB
// MAKEROWY(NAME, R, G, B, BPP) defines two row functions for one packed RGB
// layout, where R, G and B are the byte offsets of the channels within a
// pixel and BPP is the number of bytes per pixel:
//   NAME##ToYRow_C  - writes one RGBToY value per source pixel.
//   NAME##ToUVRow_C - writes one U and one V value per 2x2 block, reading the
//                     row at src_rgb and the row at src_rgb + src_stride_rgb.
// In this variant each channel of the 2x2 block is reduced with nested AVGB:
// the two vertical pairs are averaged first, then the two results, rounding
// up at each step. For odd widths the last column is a vertical pair only.
#define MAKEROWY(NAME, R, G, B, BPP)                                       \
  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int x;                                                                 \
    for (x = 0; x < width; ++x) {                                          \
      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                      \
      dst_y += 1;                                                          \
    }                                                                      \
  }                                                                        \
  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                       uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                    \
    int x;                                                                 \
    for (x = 0; x < width - 1; x += 2) {                                   \
      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]),                     \
                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP]));        \
      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]),                     \
                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP]));        \
      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]),                     \
                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP]));        \
      dst_u[0] = RGBToU(ar, ag, ab);                                       \
      dst_v[0] = RGBToV(ar, ag, ab);                                       \
      src_rgb += BPP * 2;                                                  \
      src_rgb1 += BPP * 2;                                                 \
      dst_u += 1;                                                          \
      dst_v += 1;                                                          \
    }                                                                      \
    if (width & 1) {                                                       \
      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]);                          \
      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]);                          \
      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]);                          \
      dst_u[0] = RGBToU(ar, ag, ab);                                       \
      dst_v[0] = RGBToV(ar, ag, ab);                                       \
    }                                                                      \
  }
600 #else
// ARM version does sum / 2 then multiply by 2x smaller coefficients
//
// MAKEROWY(NAME, R, G, B, BPP) defines two row functions for one packed RGB
// layout, where R, G and B are the byte offsets of the channels within a
// pixel and BPP is the number of bytes per pixel:
//   NAME##ToYRow_C  - writes one RGBToY value per source pixel.
//   NAME##ToUVRow_C - writes one U and one V value per 2x2 block, reading the
//                     row at src_rgb and the row at src_rgb + src_stride_rgb.
// In this variant each channel of the 2x2 block is summed and halved with
// rounding, giving twice the average, which RGB2xToU / RGB2xToV expect. For
// odd widths the last column passes the plain sum of its vertical pair, which
// is on the same 2x scale.
#define MAKEROWY(NAME, R, G, B, BPP)                                       \
  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int x;                                                                 \
    for (x = 0; x < width; ++x) {                                          \
      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                      \
      dst_y += 1;                                                          \
    }                                                                      \
  }                                                                        \
  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                       uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                    \
    int x;                                                                 \
    for (x = 0; x < width - 1; x += 2) {                                   \
      uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] +         \
                     src_rgb1[B + BPP] + 1) >>                             \
                    1;                                                     \
      uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] +         \
                     src_rgb1[G + BPP] + 1) >>                             \
                    1;                                                     \
      uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] +         \
                     src_rgb1[R + BPP] + 1) >>                             \
                    1;                                                     \
      dst_u[0] = RGB2xToU(ar, ag, ab);                                     \
      dst_v[0] = RGB2xToV(ar, ag, ab);                                     \
      src_rgb += BPP * 2;                                                  \
      src_rgb1 += BPP * 2;                                                 \
      dst_u += 1;                                                          \
      dst_v += 1;                                                          \
    }                                                                      \
    if (width & 1) {                                                       \
      uint16_t ab = src_rgb[B] + src_rgb1[B];                              \
      uint16_t ag = src_rgb[G] + src_rgb1[G];                              \
      uint16_t ar = src_rgb[R] + src_rgb1[R];                              \
      dst_u[0] = RGB2xToU(ar, ag, ab);                                     \
      dst_v[0] = RGB2xToV(ar, ag, ab);                                     \
    }                                                                      \
  }
640 #endif
641 
// Instantiate the ToYRow_C / ToUVRow_C pair for each packed RGB layout.
// Arguments are: name prefix, byte offset of R, of G and of B within a pixel,
// and bytes per pixel.
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
MAKEROWY(ABGR, 0, 1, 2, 4)
MAKEROWY(RGBA, 3, 2, 1, 4)
MAKEROWY(RGB24, 2, 1, 0, 3)
MAKEROWY(RAW, 0, 1, 2, 3)
#undef MAKEROWY
649 
650 // JPeg uses a variation on BT.601-1 full range
651 // y =  0.29900 * r + 0.58700 * g + 0.11400 * b
652 // u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
653 // v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
654 // BT.601 Mpeg range uses:
655 // b 0.1016 * 255 = 25.908 = 25
656 // g 0.5078 * 255 = 129.489 = 129
657 // r 0.2578 * 255 = 65.739 = 66
658 // JPeg 7 bit Y (deprecated)
659 // b 0.11400 * 128 = 14.592 = 15
660 // g 0.58700 * 128 = 75.136 = 75
661 // r 0.29900 * 128 = 38.272 = 38
662 // JPeg 8 bit Y:
663 // b 0.11400 * 256 = 29.184 = 29
664 // g 0.58700 * 256 = 150.272 = 150
665 // r 0.29900 * 256 = 76.544 = 77
666 // JPeg 8 bit U:
667 // b  0.50000 * 255 = 127.5 = 127
668 // g -0.33126 * 255 = -84.4713 = -84
669 // r -0.16874 * 255 = -43.0287 = -43
670 // JPeg 8 bit V:
671 // b -0.08131 * 255 = -20.73405 = -20
672 // g -0.41869 * 255 = -106.76595 = -107
673 // r  0.50000 * 255 = 127.5 = 127
674 
675 #ifdef LIBYUV_RGB7
676 // Old 7 bit math for compatibility on unsupported platforms.
// 7 bit fixed point full range luma: weights 38/75/15 out of 128, with 64
// added for rounding.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 38 * r + 75 * g + 15 * b;
  return (weighted + 64) >> 7;
}
680 #else
681 // 8 bit
// 8 bit fixed point full range luma: weights 77/150/29 out of 256, with 128
// added for rounding.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 77 * r + 150 * g + 29 * b;
  return (weighted + 128) >> 8;
}
685 #endif
686 
687 #if defined(LIBYUV_ARGBTOUV_PAVGB)
// Full range U: weights 127/-84/-43 out of 256. The constant 0x8080 is the
// +128 bias plus 0x80 for rounding.
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 127 * b - 84 * g - 43 * r;
  return (weighted + 0x8080) >> 8;
}
// Full range V: weights 127/-107/-20 out of 256. The constant 0x8080 is the
// +128 bias plus 0x80 for rounding.
static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 127 * r - 107 * g - 20 * b;
  return (weighted + 0x8080) >> 8;
}
694 #else
// Full range U from channel values that are twice the usual scale: the
// 127/-84/-43 weights are halved with integer division (63/-42/-21).
// 0x8080 is the +128 bias plus rounding.
static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
  const int weighted = (127 / 2) * b - (84 / 2) * g - (43 / 2) * r;
  return (weighted + 0x8080) >> 8;
}
// Full range V from channel values that are twice the usual scale: the
// 127/-107/-20 weights are halved with integer division (63/-53/-10).
// 0x8080 is the +128 bias plus rounding.
static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
  const int weighted = (127 / 2) * r - (107 / 2) * g - (20 / 2) * b;
  return (weighted + 0x8080) >> 8;
}
701 #endif
702 
703 // ARGBToYJ_C and ARGBToUVJ_C
704 // Intel version mimic SSE/AVX which does 2 pavgb
705 #if LIBYUV_ARGBTOUV_PAVGB
// MAKEROWYJ(NAME, R, G, B, BPP) defines two full range (JPeg) row functions
// for one packed RGB layout, where R, G and B are the byte offsets of the
// channels within a pixel and BPP is the number of bytes per pixel:
//   NAME##ToYJRow_C  - writes one RGBToYJ value per source pixel.
//   NAME##ToUVJRow_C - writes one U and one V value per 2x2 block, reading
//                      the row at src_rgb and the row at
//                      src_rgb + src_stride_rgb.
// In this variant each channel of the 2x2 block is reduced with nested AVGB:
// the two vertical pairs are averaged first, then the two results, rounding
// up at each step. For odd widths the last column is a vertical pair only.
#define MAKEROWYJ(NAME, R, G, B, BPP)                                       \
  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int x;                                                                  \
    for (x = 0; x < width; ++x) {                                           \
      dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                       \
      dst_y += 1;                                                           \
    }                                                                       \
  }                                                                         \
  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                        uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                     \
    int x;                                                                  \
    for (x = 0; x < width - 1; x += 2) {                                    \
      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]),                      \
                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP]));         \
      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]),                      \
                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP]));         \
      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]),                      \
                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP]));         \
      dst_u[0] = RGBToUJ(ar, ag, ab);                                       \
      dst_v[0] = RGBToVJ(ar, ag, ab);                                       \
      src_rgb += BPP * 2;                                                   \
      src_rgb1 += BPP * 2;                                                  \
      dst_u += 1;                                                           \
      dst_v += 1;                                                           \
    }                                                                       \
    if (width & 1) {                                                        \
      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]);                           \
      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]);                           \
      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]);                           \
      dst_u[0] = RGBToUJ(ar, ag, ab);                                       \
      dst_v[0] = RGBToVJ(ar, ag, ab);                                       \
    }                                                                       \
  }
741 #else
// ARM version does sum / 2 then multiply by 2x smaller coefficients.
// MAKEROWYJ(NAME, R, G, B, BPP) emits two row functions for a packed RGB
// format whose R, G and B bytes sit at offsets R, G and B inside a pixel of
// BPP bytes:
//   NAME##ToYJRow_C  - one RGBToYJ result per pixel.
//   NAME##ToUVJRow_C - one U and one V per 2x2 block taken from this row and
//     the row at src_stride_rgb. Each channel is handed to RGB2xToUJ /
//     RGB2xToVJ as (sum of 4 samples + 1) >> 1; for an odd width the last
//     column hands over the plain sum of its 2 samples.
#define MAKEROWYJ(NAME, R, G, B, BPP)                                       \
  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int i;                                                                  \
    for (i = 0; i < width; ++i) {                                           \
      *dst_y++ = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                       \
    }                                                                       \
  }                                                                         \
  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                        uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* row0 = src_rgb;                                          \
    const uint8_t* row1 = src_rgb + src_stride_rgb;                         \
    int x;                                                                  \
    for (x = 0; x < width - 1; x += 2) {                                    \
      const uint16_t b2x =                                                  \
          (row0[B] + row0[B + BPP] + row1[B] + row1[B + BPP] + 1) >> 1;     \
      const uint16_t g2x =                                                  \
          (row0[G] + row0[G + BPP] + row1[G] + row1[G + BPP] + 1) >> 1;     \
      const uint16_t r2x =                                                  \
          (row0[R] + row0[R + BPP] + row1[R] + row1[R + BPP] + 1) >> 1;     \
      *dst_u++ = RGB2xToUJ(r2x, g2x, b2x);                                  \
      *dst_v++ = RGB2xToVJ(r2x, g2x, b2x);                                  \
      row0 += BPP * 2;                                                      \
      row1 += BPP * 2;                                                      \
    }                                                                       \
    if (width & 1) {                                                        \
      const uint16_t b2x = row0[B] + row1[B];                               \
      const uint16_t g2x = row0[G] + row1[G];                               \
      const uint16_t r2x = row0[R] + row1[R];                               \
      *dst_u = RGB2xToUJ(r2x, g2x, b2x);                                    \
      *dst_v = RGB2xToVJ(r2x, g2x, b2x);                                    \
    }                                                                       \
  }
781 
782 #endif
783 
// Instantiate NAME##ToYJRow_C and NAME##ToUVJRow_C for four packed RGB
// layouts. Arguments: NAME, the byte offsets of R, G and B within one pixel,
// and the bytes per pixel (BPP).
MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
MAKEROWYJ(RGB24, 2, 1, 0, 3)
MAKEROWYJ(RAW, 0, 1, 2, 3)
// The generator macro is no longer needed past this point.
#undef MAKEROWYJ
789 
// Convert a row of RGB565 pixels (2 bytes each, low byte first) to Y.
// Blue is bits 0-4, green bits 5-10 and red bits 11-15 of the 16 bit pixel.
// Each channel is widened to 8 bits by replicating its top bits into the
// vacated low bits before being passed to RGBToY.
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  for (; width > 0; --width) {
    const unsigned pixel = src_rgb565[0] | (src_rgb565[1] << 8);
    const unsigned b5 = pixel & 0x1f;
    const unsigned g6 = (pixel >> 5) & 0x3f;
    const unsigned r5 = pixel >> 11;
    const uint8_t b = (uint8_t)((b5 << 3) | (b5 >> 2));
    const uint8_t g = (uint8_t)((g6 << 2) | (g6 >> 4));
    const uint8_t r = (uint8_t)((r5 << 3) | (r5 >> 2));
    *dst_y++ = RGBToY(r, g, b);
    src_rgb565 += 2;
  }
}
804 
// Convert a row of ARGB1555 pixels (2 bytes each, low byte first) to Y.
// Blue is bits 0-4, green bits 5-9 and red bits 10-14 of the 16 bit pixel;
// bit 15 is not used here. Each 5 bit channel is widened to 8 bits by
// replicating its top 3 bits before being passed to RGBToY.
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
  for (; width > 0; --width) {
    const unsigned pixel = src_argb1555[0] | (src_argb1555[1] << 8);
    const unsigned b5 = pixel & 0x1f;
    const unsigned g5 = (pixel >> 5) & 0x1f;
    const unsigned r5 = (pixel >> 10) & 0x1f;
    const uint8_t b = (uint8_t)((b5 << 3) | (b5 >> 2));
    const uint8_t g = (uint8_t)((g5 << 3) | (g5 >> 2));
    const uint8_t r = (uint8_t)((r5 << 3) | (r5 >> 2));
    *dst_y++ = RGBToY(r, g, b);
    src_argb1555 += 2;
  }
}
819 
// Convert a row of ARGB4444 pixels (2 bytes each) to Y.
// Byte 0 holds blue (low nibble) and green (high nibble); the low nibble of
// byte 1 holds red and its high nibble is not used here. Each nibble n is
// widened to 8 bits as n * 0x11, i.e. (n << 4) | n.
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
  for (; width > 0; --width) {
    const uint8_t b = (uint8_t)((src_argb4444[0] & 0x0f) * 0x11);
    const uint8_t g = (uint8_t)((src_argb4444[0] >> 4) * 0x11);
    const uint8_t r = (uint8_t)((src_argb4444[1] & 0x0f) * 0x11);
    *dst_y++ = RGBToY(r, g, b);
    src_argb4444 += 2;
  }
}
834 
// Compute one row of U and V from two rows of RGB565 pixels.
// Each output sample covers a 2x2 block: two pixels from src_rgb565 and two
// from the row src_stride_rgb565 bytes further on. For an odd width the last
// output covers a single column (2 pixels). Channels are first widened to
// 8 bits by bit replication. Depending on LIBYUV_ARGBTOUV_PAVGB the block is
// reduced either with nested AVGB averages fed to RGBToU/RGBToV, or as a
// 2x scaled sum fed to RGB2xToU/RGB2xToV.
void RGB565ToUVRow_C(const uint8_t* src_rgb565,
                     int src_stride_rgb565,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
  uint8_t b[4];
  uint8_t g[4];
  uint8_t r[4];
  int x;
  int i;
  for (x = 0; x < width - 1; x += 2) {
    // Sample order: 0/1 are the two pixels of the current row, 2/3 the two
    // pixels of the next row.
    const uint8_t* const px[4] = {src_rgb565, src_rgb565 + 2, next_rgb565,
                                  next_rgb565 + 2};
    for (i = 0; i < 4; ++i) {
      const unsigned pixel = px[i][0] | (px[i][1] << 8);
      const unsigned b5 = pixel & 0x1f;
      const unsigned g6 = (pixel >> 5) & 0x3f;
      const unsigned r5 = pixel >> 11;
      b[i] = (uint8_t)((b5 << 3) | (b5 >> 2));
      g[i] = (uint8_t)((g6 << 2) | (g6 >> 4));
      r[i] = (uint8_t)((r5 << 3) | (r5 >> 2));
    }

#if LIBYUV_ARGBTOUV_PAVGB
    const uint8_t ab = AVGB(AVGB(b[0], b[2]), AVGB(b[1], b[3]));
    const uint8_t ag = AVGB(AVGB(g[0], g[2]), AVGB(g[1], g[3]));
    const uint8_t ar = AVGB(AVGB(r[0], r[2]), AVGB(r[1], r[3]));
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
#else
    const uint16_t sum_b = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
    const uint16_t sum_g = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
    const uint16_t sum_r = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
    dst_u[0] = RGB2xToU(sum_r, sum_g, sum_b);
    dst_v[0] = RGB2xToV(sum_r, sum_g, sum_b);
#endif

    src_rgb565 += 4;
    next_rgb565 += 4;
    ++dst_u;
    ++dst_v;
  }
  if (width & 1) {
    // Trailing column: index 0 is the current row, index 1 the next row.
    const uint8_t* const px[2] = {src_rgb565, next_rgb565};
    for (i = 0; i < 2; ++i) {
      const unsigned pixel = px[i][0] | (px[i][1] << 8);
      const unsigned b5 = pixel & 0x1f;
      const unsigned g6 = (pixel >> 5) & 0x3f;
      const unsigned r5 = pixel >> 11;
      b[i] = (uint8_t)((b5 << 3) | (b5 >> 2));
      g[i] = (uint8_t)((g6 << 2) | (g6 >> 4));
      r[i] = (uint8_t)((r5 << 3) | (r5 >> 2));
    }

#if LIBYUV_ARGBTOUV_PAVGB
    const uint8_t ab = AVGB(b[0], b[1]);
    const uint8_t ag = AVGB(g[0], g[1]);
    const uint8_t ar = AVGB(r[0], r[1]);
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
#else
    const uint16_t sum_b = b[0] + b[1];
    const uint16_t sum_g = g[0] + g[1];
    const uint16_t sum_r = r[0] + r[1];
    dst_u[0] = RGB2xToU(sum_r, sum_g, sum_b);
    dst_v[0] = RGB2xToV(sum_r, sum_g, sum_b);
#endif
  }
}
918 
// Compute one row of U and V from two rows of ARGB1555 pixels.
// Each output sample covers a 2x2 block: two pixels from src_argb1555 and two
// from the row src_stride_argb1555 bytes further on. For an odd width the
// last output covers a single column (2 pixels). The 5 bit channels (blue
// bits 0-4, green bits 5-9, red bits 10-14; bit 15 unused here) are widened
// to 8 bits by bit replication. Depending on LIBYUV_ARGBTOUV_PAVGB the block
// is reduced either with nested AVGB averages fed to RGBToU/RGBToV, or as a
// 2x scaled sum fed to RGB2xToU/RGB2xToV.
void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
                       int src_stride_argb1555,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
  uint8_t b[4];
  uint8_t g[4];
  uint8_t r[4];
  int x;
  int i;
  for (x = 0; x < width - 1; x += 2) {
    // Sample order: 0/1 are the two pixels of the current row, 2/3 the two
    // pixels of the next row.
    const uint8_t* const px[4] = {src_argb1555, src_argb1555 + 2,
                                  next_argb1555, next_argb1555 + 2};
    for (i = 0; i < 4; ++i) {
      const unsigned pixel = px[i][0] | (px[i][1] << 8);
      const unsigned b5 = pixel & 0x1f;
      const unsigned g5 = (pixel >> 5) & 0x1f;
      const unsigned r5 = (pixel >> 10) & 0x1f;
      b[i] = (uint8_t)((b5 << 3) | (b5 >> 2));
      g[i] = (uint8_t)((g5 << 3) | (g5 >> 2));
      r[i] = (uint8_t)((r5 << 3) | (r5 >> 2));
    }

#if LIBYUV_ARGBTOUV_PAVGB
    const uint8_t ab = AVGB(AVGB(b[0], b[2]), AVGB(b[1], b[3]));
    const uint8_t ag = AVGB(AVGB(g[0], g[2]), AVGB(g[1], g[3]));
    const uint8_t ar = AVGB(AVGB(r[0], r[2]), AVGB(r[1], r[3]));
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
#else
    const uint16_t sum_b = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
    const uint16_t sum_g = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
    const uint16_t sum_r = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
    dst_u[0] = RGB2xToU(sum_r, sum_g, sum_b);
    dst_v[0] = RGB2xToV(sum_r, sum_g, sum_b);
#endif

    src_argb1555 += 4;
    next_argb1555 += 4;
    ++dst_u;
    ++dst_v;
  }
  if (width & 1) {
    // Trailing column: index 0 is the current row, index 1 the next row.
    const uint8_t* const px[2] = {src_argb1555, next_argb1555};
    for (i = 0; i < 2; ++i) {
      const unsigned pixel = px[i][0] | (px[i][1] << 8);
      const unsigned b5 = pixel & 0x1f;
      const unsigned g5 = (pixel >> 5) & 0x1f;
      const unsigned r5 = (pixel >> 10) & 0x1f;
      b[i] = (uint8_t)((b5 << 3) | (b5 >> 2));
      g[i] = (uint8_t)((g5 << 3) | (g5 >> 2));
      r[i] = (uint8_t)((r5 << 3) | (r5 >> 2));
    }

#if LIBYUV_ARGBTOUV_PAVGB
    const uint8_t ab = AVGB(b[0], b[1]);
    const uint8_t ag = AVGB(g[0], g[1]);
    const uint8_t ar = AVGB(r[0], r[1]);
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
#else
    const uint16_t sum_b = b[0] + b[1];
    const uint16_t sum_g = g[0] + g[1];
    const uint16_t sum_r = r[0] + r[1];
    dst_u[0] = RGB2xToU(sum_r, sum_g, sum_b);
    dst_v[0] = RGB2xToV(sum_r, sum_g, sum_b);
#endif
  }
}
1002 
// Compute one row of U and V from two rows of ARGB4444 pixels.
// Each output sample covers a 2x2 block: two pixels from src_argb4444 and two
// from the row src_stride_argb4444 bytes further on. For an odd width the
// last output covers a single column (2 pixels). Byte 0 of a pixel holds blue
// (low nibble) and green (high nibble); the low nibble of byte 1 holds red.
// Each nibble n is widened to 8 bits as n * 0x11, i.e. (n << 4) | n.
// Depending on LIBYUV_ARGBTOUV_PAVGB the block is reduced either with nested
// AVGB averages fed to RGBToU/RGBToV, or as a 2x scaled sum fed to
// RGB2xToU/RGB2xToV.
void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
                       int src_stride_argb4444,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444;
  uint8_t b[4];
  uint8_t g[4];
  uint8_t r[4];
  int x;
  int i;
  for (x = 0; x < width - 1; x += 2) {
    // Sample order: 0/1 are the two pixels of the current row, 2/3 the two
    // pixels of the next row.
    const uint8_t* const px[4] = {src_argb4444, src_argb4444 + 2,
                                  next_argb4444, next_argb4444 + 2};
    for (i = 0; i < 4; ++i) {
      b[i] = (uint8_t)((px[i][0] & 0x0f) * 0x11);
      g[i] = (uint8_t)((px[i][0] >> 4) * 0x11);
      r[i] = (uint8_t)((px[i][1] & 0x0f) * 0x11);
    }

#if LIBYUV_ARGBTOUV_PAVGB
    const uint8_t ab = AVGB(AVGB(b[0], b[2]), AVGB(b[1], b[3]));
    const uint8_t ag = AVGB(AVGB(g[0], g[2]), AVGB(g[1], g[3]));
    const uint8_t ar = AVGB(AVGB(r[0], r[2]), AVGB(r[1], r[3]));
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
#else
    const uint16_t sum_b = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
    const uint16_t sum_g = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
    const uint16_t sum_r = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
    dst_u[0] = RGB2xToU(sum_r, sum_g, sum_b);
    dst_v[0] = RGB2xToV(sum_r, sum_g, sum_b);
#endif

    src_argb4444 += 4;
    next_argb4444 += 4;
    ++dst_u;
    ++dst_v;
  }
  if (width & 1) {
    // Trailing column: index 0 is the current row, index 1 the next row.
    const uint8_t* const px[2] = {src_argb4444, next_argb4444};
    for (i = 0; i < 2; ++i) {
      b[i] = (uint8_t)((px[i][0] & 0x0f) * 0x11);
      g[i] = (uint8_t)((px[i][0] >> 4) * 0x11);
      r[i] = (uint8_t)((px[i][1] & 0x0f) * 0x11);
    }

#if LIBYUV_ARGBTOUV_PAVGB
    const uint8_t ab = AVGB(b[0], b[1]);
    const uint8_t ag = AVGB(g[0], g[1]);
    const uint8_t ar = AVGB(r[0], r[1]);
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
#else
    const uint16_t sum_b = b[0] + b[1];
    const uint16_t sum_g = g[0] + g[1];
    const uint16_t sum_r = r[0] + r[1];
    dst_u[0] = RGB2xToU(sum_r, sum_g, sum_b);
    dst_v[0] = RGB2xToV(sum_r, sum_g, sum_b);
#endif
  }
}
1086 
// Compute U and V for every pixel of an ARGB row (no subsampling).
// Bytes 0, 1 and 2 of each 4 byte pixel are passed to RGBToU/RGBToV as blue,
// green and red; byte 3 is not read.
void ARGBToUV444Row_C(const uint8_t* src_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    dst_u[i] = RGBToU(red, green, blue);
    dst_v[i] = RGBToV(red, green, blue);
    src_argb += 4;
  }
}
1103 
// Convert a row of ARGB pixels to gray: the three color bytes of each output
// pixel are set to RGBToYJ of the source color, and byte 3 is copied through.
void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (; width > 0; --width) {
    const uint8_t gray = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
    dst_argb[0] = gray;
    dst_argb[1] = gray;
    dst_argb[2] = gray;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
1114 
// Apply a sepia tone to a row of ARGB pixels in place.
// Each output channel is a weighted sum of the input B, G and R in 7 bit
// fixed point (weights / 128). Byte 3 of every pixel is left untouched.
void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i, dst_argb += 4) {
    const int blue = dst_argb[0];
    const int green = dst_argb[1];
    const int red = dst_argb[2];
    const int sepia_b = (blue * 17 + green * 68 + red * 35) >> 7;
    const int sepia_g = (blue * 22 + green * 88 + red * 45) >> 7;
    const int sepia_r = (blue * 24 + green * 98 + red * 50) >> 7;
    // The blue weights sum to 120 (< 128), so sepia_b cannot exceed 255 and
    // needs no clamp; green and red can overflow 8 bits.
    dst_argb[0] = sepia_b;
    dst_argb[1] = clamp255(sepia_g);
    dst_argb[2] = clamp255(sepia_r);
  }
}
1132 
// Apply a signed 4x4 color matrix to a row of ARGB pixels.
// matrix_argb holds 16 int8 coefficients, 4 per output channel in the order
// B, G, R, A; each group multiplies the input (B, G, R, A). Coefficients are
// 6 bit fixed point: the sum is shifted right by 6 and then passed to Clamp.
// TODO(fbarchard): Consider adding rounding (+32).
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const int8_t* matrix_argb,
                          int width) {
  int x;
  int c;
  for (x = 0; x < width; ++x) {
    const int b = src_argb[0];
    const int g = src_argb[1];
    const int r = src_argb[2];
    const int a = src_argb[3];
    int sum[4];
    // Evaluate all four rows before storing anything.
    for (c = 0; c < 4; ++c) {
      const int8_t* coeff = matrix_argb + c * 4;
      sum[c] = (b * coeff[0] + g * coeff[1] + r * coeff[2] + a * coeff[3]) >> 6;
    }
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = Clamp(sum[c]);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
1165 
// Remap all four channels of a row of ARGB pixels in place through a lookup
// table. table_argb is interleaved: entry v of channel c lives at
// table_argb[v * 4 + c], so it must hold 256 * 4 bytes.
void ARGBColorTableRow_C(uint8_t* dst_argb,
                         const uint8_t* table_argb,
                         int width) {
  for (; width > 0; --width) {
    // Capture all four table offsets before any channel is overwritten.
    const int b_index = dst_argb[0] * 4 + 0;
    const int g_index = dst_argb[1] * 4 + 1;
    const int r_index = dst_argb[2] * 4 + 2;
    const int a_index = dst_argb[3] * 4 + 3;
    dst_argb[0] = table_argb[b_index];
    dst_argb[1] = table_argb[g_index];
    dst_argb[2] = table_argb[r_index];
    dst_argb[3] = table_argb[a_index];
    dst_argb += 4;
  }
}
1183 
// Remap the B, G and R channels of a row of ARGB pixels in place through a
// lookup table; byte 3 of every pixel is left untouched. table_argb uses the
// same interleaved layout of 4 bytes per entry: channel c of value v is at
// table_argb[v * 4 + c].
void RGBColorTableRow_C(uint8_t* dst_argb,
                        const uint8_t* table_argb,
                        int width) {
  for (; width > 0; --width) {
    // Capture the three table offsets before any channel is overwritten.
    const int b_index = dst_argb[0] * 4 + 0;
    const int g_index = dst_argb[1] * 4 + 1;
    const int r_index = dst_argb[2] * 4 + 2;
    dst_argb[0] = table_argb[b_index];
    dst_argb[1] = table_argb[g_index];
    dst_argb[2] = table_argb[r_index];
    dst_argb += 4;
  }
}
1199 
// Quantize (posterize) the B, G and R channels of a row of ARGB pixels in
// place: each value v becomes
//   ((v * scale) >> 16) * interval_size + interval_offset
// truncated to 8 bits. scale is a 16.16 fixed point multiplier. Byte 3 of
// every pixel is left untouched.
void ARGBQuantizeRow_C(uint8_t* dst_argb,
                       int scale,
                       int interval_size,
                       int interval_offset,
                       int width) {
  int c;
  for (; width > 0; --width) {
    for (c = 0; c < 3; ++c) {
      const int level = (dst_argb[c] * scale) >> 16;
      dst_argb[c] = level * interval_size + interval_offset;
    }
    dst_argb += 4;
  }
}
1216 
// Scale every channel of a row of ARGB pixels by the matching byte of value.
// value packs the four 8 bit scale factors as B (bits 0-7), G (bits 8-15),
// R (bits 16-23) and A (bits 24-31). Both the pixel byte and the scale byte
// are widened to 16 bits by byte replication (x * 0x0101 == x | x << 8), the
// two are multiplied, and the top 8 bits of the 32 bit product are stored.
// The largest product is 0xffff * 0xffff, which fits in uint32_t.
// Written as explicit arithmetic rather than unparenthesized helper macros,
// so operator precedence is visible at every use.
void ARGBShadeRow_C(const uint8_t* src_argb,
                    uint8_t* dst_argb,
                    int width,
                    uint32_t value) {
  const uint32_t b_scale = (value & 0xff) * 0x0101u;
  const uint32_t g_scale = ((value >> 8) & 0xff) * 0x0101u;
  const uint32_t r_scale = ((value >> 16) & 0xff) * 0x0101u;
  const uint32_t a_scale = (value >> 24) * 0x0101u;

  int i;
  for (i = 0; i < width; ++i) {
    // Read the whole pixel before storing so in-place use stays correct.
    const uint32_t b = (uint32_t)src_argb[0] * 0x0101u;
    const uint32_t g = (uint32_t)src_argb[1] * 0x0101u;
    const uint32_t r = (uint32_t)src_argb[2] * 0x0101u;
    const uint32_t a = (uint32_t)src_argb[3] * 0x0101u;
    dst_argb[0] = (uint8_t)((b * b_scale) >> 24);
    dst_argb[1] = (uint8_t)((g * g_scale) >> 24);
    dst_argb[2] = (uint8_t)((r * r_scale) >> 24);
    dst_argb[3] = (uint8_t)((a * a_scale) >> 24);
    src_argb += 4;
    dst_argb += 4;
  }
}
1245 
// Multiply two rows of ARGB pixels channel by channel.
// Each byte of src_argb is widened to 16 bits by byte replication
// (x * 0x0101 == x | x << 8), multiplied by the matching 8 bit byte of
// src_argb1, and the product is shifted right by 16 to give the 8 bit result.
// Written as explicit arithmetic rather than unparenthesized helper macros,
// so operator precedence is visible at every use.
void ARGBMultiplyRow_C(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // Read both pixels completely before storing so in-place use stays
    // correct.
    const uint32_t b = (uint32_t)src_argb[0] * 0x0101u;
    const uint32_t g = (uint32_t)src_argb[1] * 0x0101u;
    const uint32_t r = (uint32_t)src_argb[2] * 0x0101u;
    const uint32_t a = (uint32_t)src_argb[3] * 0x0101u;
    const uint32_t b_scale = src_argb1[0];
    const uint32_t g_scale = src_argb1[1];
    const uint32_t r_scale = src_argb1[2];
    const uint32_t a_scale = src_argb1[3];
    dst_argb[0] = (uint8_t)((b * b_scale) >> 16);
    dst_argb[1] = (uint8_t)((g * g_scale) >> 16);
    dst_argb[2] = (uint8_t)((r * r_scale) >> 16);
    dst_argb[3] = (uint8_t)((a * a_scale) >> 16);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
1274 
// Add two rows of ARGB pixels channel by channel; each sum is passed through
// clamp255 before being stored.
// Written as explicit calls rather than a helper macro with unparenthesized
// arguments.
void ARGBAddRow_C(const uint8_t* src_argb,
                  const uint8_t* src_argb1,
                  uint8_t* dst_argb,
                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // Read both pixels completely before storing so in-place use stays
    // correct.
    const int b = src_argb[0];
    const int g = src_argb[1];
    const int r = src_argb[2];
    const int a = src_argb[3];
    const int b_add = src_argb1[0];
    const int g_add = src_argb1[1];
    const int r_add = src_argb1[2];
    const int a_add = src_argb1[3];
    dst_argb[0] = clamp255(b + b_add);
    dst_argb[1] = clamp255(g + g_add);
    dst_argb[2] = clamp255(r + r_add);
    dst_argb[3] = clamp255(a + a_add);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
1301 
// Subtract src_argb1 from src_argb channel by channel; each difference is
// passed through clamp0 before being stored.
// Written as explicit calls rather than a helper macro with unparenthesized
// arguments.
void ARGBSubtractRow_C(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    // Read both pixels completely before storing so in-place use stays
    // correct.
    const int b = src_argb[0];
    const int g = src_argb[1];
    const int r = src_argb[2];
    const int a = src_argb[3];
    const int b_sub = src_argb1[0];
    const int g_sub = src_argb1[1];
    const int r_sub = src_argb1[2];
    const int a_sub = src_argb1[3];
    dst_argb[0] = clamp0(b - b_sub);
    dst_argb[1] = clamp0(g - g_sub);
    dst_argb[2] = clamp0(r - r_sub);
    dst_argb[3] = clamp0(a - a_sub);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
1328 
// Sobel functions which mimic SSSE3.

// Horizontal Sobel gradient for one output row from three source rows.
// For each i the result is |d0 + 2 * d1 + d2| passed through clamp255, where
// dk is column i minus column i + 2 of row k. Note that columns up to
// width + 1 of every source row are read.
void SobelXRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 const uint8_t* src_y2,
                 uint8_t* dst_sobelx,
                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const int d0 = src_y0[i] - src_y0[i + 2];
    const int d1 = src_y1[i] - src_y1[i + 2];
    const int d2 = src_y2[i] - src_y2[i + 2];
    const int magnitude = Abs(d0 + d1 * 2 + d2);
    dst_sobelx[i] = (uint8_t)(clamp255(magnitude));
  }
}
1350 
// Vertical Sobel gradient for one output row from two source rows.
// For each i the result is |d0 + 2 * d1 + d2| passed through clamp255, where
// dk is src_y0 minus src_y1 at column i + k. Note that columns up to
// width + 1 of both source rows are read.
void SobelYRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 uint8_t* dst_sobely,
                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const int d0 = src_y0[i + 0] - src_y1[i + 0];
    const int d1 = src_y0[i + 1] - src_y1[i + 1];
    const int d2 = src_y0[i + 2] - src_y1[i + 2];
    const int magnitude = Abs(d0 + d1 * 2 + d2);
    dst_sobely[i] = (uint8_t)(clamp255(magnitude));
  }
}
1370 
// Combine Sobel X and Y rows into gray ARGB pixels: B, G and R are all set to
// clamp255(sobelx + sobely) and byte 3 is set to 255.
void SobelRow_C(const uint8_t* src_sobelx,
                const uint8_t* src_sobely,
                uint8_t* dst_argb,
                int width) {
  int i;
  for (i = 0; i < width; ++i, dst_argb += 4) {
    const int sum = clamp255(src_sobelx[i] + src_sobely[i]);
    const uint8_t gray = (uint8_t)(sum);
    dst_argb[0] = gray;
    dst_argb[1] = gray;
    dst_argb[2] = gray;
    dst_argb[3] = (uint8_t)(255u);
  }
}
1387 
// Combine Sobel X and Y rows into a single 8 bit plane:
// dst_y[i] = clamp255(sobelx[i] + sobely[i]).
void SobelToPlaneRow_C(const uint8_t* src_sobelx,
                       const uint8_t* src_sobely,
                       uint8_t* dst_y,
                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const int sum = clamp255(src_sobelx[i] + src_sobely[i]);
    dst_y[i] = (uint8_t)(sum);
  }
}
1400 
// Pack Sobel X and Y rows into ARGB pixels: byte 0 (B) = sobely,
// byte 1 (G) = clamp255(sobelx + sobely), byte 2 (R) = sobelx, byte 3 = 255.
void SobelXYRow_C(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  int i;
  for (i = 0; i < width; ++i, dst_argb += 4) {
    const int grad_x = src_sobelx[i];
    const int grad_y = src_sobely[i];
    const int combined = clamp255(grad_x + grad_y);
    dst_argb[0] = (uint8_t)(grad_y);
    dst_argb[1] = (uint8_t)(combined);
    dst_argb[2] = (uint8_t)(grad_x);
    dst_argb[3] = (uint8_t)(255u);
  }
}
1417 
// Expand a row of 8 bit Y samples to ARGB: the Y value is copied unchanged
// into B, G and R, and byte 3 is set to 255.
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  for (; width > 0; --width) {
    const uint8_t luma = *src_y++;
    dst_argb[0] = luma;
    dst_argb[1] = luma;
    dst_argb[2] = luma;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}
1429 
// Macros to create SIMD specific yuv to rgb conversion constants.
// YUVCONSTANTSBODY expands to a brace initializer built from the six
// coefficients; the layout differs per target architecture.

// clang-format off

#if defined(__aarch64__) || defined(__arm__)
// ARM layout: two initializer lists. The first holds the chroma coefficients
// in the order UB, VR, UG, VG followed by 12 zeros. The second holds YG, three
// precomputed biases (for B, G and R in that order) and YB, followed by 3
// zeros.
// Bias values include subtract 128 from U and V, bias from Y and rounding.
// For B and R bias is negative. For G bias is positive.
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                             \
  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},                     \
   {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
    0, 0}}
#else
// Other targets: five initializer lists. The chroma coefficients are
// replicated as interleaved pairs across 32 entries each ({UB, 0}, {UG, VG}
// and {0, VR}); YG and YB are each replicated 16 times.
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                     \
  {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,          \
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},         \
   {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,  \
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,          \
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},         \
   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
#endif

// clang-format on
1454 
// MAKEYUVCONSTANTS(name, ...) defines two SIMD aligned constant tables:
//   kYuv<name>Constants - built from the coefficients as given.
//   kYvu<name>Constants - the same matrix with the U and V roles exchanged
//     (UB <-> VR and UG <-> VG).
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR)            \
  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR);                   \
  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
1460 
1461 // TODO(fbarchard): Generate SIMD structures from float matrix.
1462 
// BT.601 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.596
//  G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
//  B = (Y - 16) * 1.164 + U * 2.018
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B, in 6 bit fixed point (scaled by 64).
// Without the unlimited data macros UB is capped at 128 (2.0 * 64).
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* min(128, round(2.018 * 64)) */
#endif
#define UG 25  /* round(0.391 * 64) */
#define VG 52  /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// Defines kYuvI601Constants and kYvuI601Constants.
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)

// Release the coefficient names so the next colorspace can redefine them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1491 
// BT.601 full range YUV to RGB reference (aka JPEG)
//  R = Y               + V * 1.40200
//  G = Y - U * 0.34414 - V * 0.71414
//  B = Y + U * 1.77200
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B, in 6 bit fixed point (scaled by 64).
#define UB 113 /* round(1.77200 * 64) */
#define UG 22  /* round(0.34414 * 64) */
#define VG 46  /* round(0.71414 * 64) */
#define VR 90  /* round(1.40200 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

// Defines kYuvJPEGConstants and kYvuJPEGConstants.
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)

// Release the coefficient names so the next colorspace can redefine them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1516 
// BT.709 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.793
//  G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
//  B = (Y - 16) * 1.164 + U * 2.112
//  KR = 0.2126, KB = 0.0722

// U and V contributions to R,G,B, in 6 bit fixed point (scaled by 64).
// Without the unlimited data macros UB is capped at 128 (2.0 * 64).
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* min(128, round(2.112 * 64)) */
#endif
#define UG 14  /* round(0.213 * 64) */
#define VG 34  /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// Defines kYuvH709Constants and kYvuH709Constants.
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)

// Release the coefficient names so the next colorspace can redefine them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1545 
1546 // BT.709 full range YUV to RGB reference
1547 //  R = Y               + V * 1.5748
1548 //  G = Y - U * 0.18732 - V * 0.46812
1549 //  B = Y + U * 1.8556
1550 //  KR = 0.2126, KB = 0.0722
1551 
1552 // U and V contributions to R,G,B.
1553 #define UB 119 /* round(1.8556 * 64) */
1554 #define UG 12  /* round(0.18732 * 64) */
1555 #define VG 30  /* round(0.46812 * 64) */
1556 #define VR 101 /* round(1.5748 * 64) */
1557 
1558 // Y contribution to R,G,B.  Scale and bias.  (same as jpeg)
1559 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
1560 #define YB 32    /* 64 / 2 */
1561 
1562 MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
1563 
1564 #undef YG
1565 #undef YB
1566 #undef UB
1567 #undef UG
1568 #undef VG
1569 #undef VR
1570 
1571 // BT.2020 limited range YUV to RGB reference
1572 //  R = (Y - 16) * 1.164384                + V * 1.67867
1573 //  G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
1574 //  B = (Y - 16) * 1.164384 + U * 2.14177
1575 // KR = 0.2627; KB = 0.0593
1576 
1577 // U and V contributions to R,G,B.
1578 #if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
1579 #define UB 137 /* round(2.142 * 64) */
1580 #else
1581 #define UB 128 /* max(128, round(2.142 * 64)) */
1582 #endif
1583 #define UG 12  /* round(0.187326 * 64) */
1584 #define VG 42  /* round(0.65042 * 64) */
1585 #define VR 107 /* round(1.67867 * 64) */
1586 
1587 // Y contribution to R,G,B.  Scale and bias.
1588 #define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
1589 #define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
1590 
1591 MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
1592 
1593 #undef YG
1594 #undef YB
1595 #undef UB
1596 #undef UG
1597 #undef VG
1598 #undef VR
1599 
1600 // BT.2020 full range YUV to RGB reference
1601 //  R = Y                + V * 1.474600
1602 //  G = Y - U * 0.164553 - V * 0.571353
1603 //  B = Y + U * 1.881400
1604 // KR = 0.2627; KB = 0.0593
1605 
1606 #define UB 120 /* round(1.881400 * 64) */
1607 #define UG 11  /* round(0.164553 * 64) */
1608 #define VG 37  /* round(0.571353 * 64) */
1609 #define VR 94  /* round(1.474600 * 64) */
1610 
1611 // Y contribution to R,G,B.  Scale and bias.  (same as jpeg)
1612 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
1613 #define YB 32    /* 64 / 2 */
1614 
1615 MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
1616 
1617 #undef YG
1618 #undef YB
1619 #undef UB
1620 #undef UG
1621 #undef VG
1622 #undef VR
1623 
// Retire the table-building macro and the bias helper names so they do not
// leak into the rest of the file.
#undef MAKEYUVCONSTANTS

#undef BR
#undef BG
#undef BB
1629 
// LOAD_YUV_CONSTANTS copies the coefficients a pixel helper needs out of
// `yuvconstants` into local ints.  CALC_RGB16 then combines them with the
// caller's `y32`, `u` and `v` and declares `b16`, `g16` and `r16`.
// Both macros expand to declarations in the caller's scope and leave the
// final semicolon to the caller.
#if defined(__aarch64__) || defined(__arm__)
// ARM: coefficients are read from kUVCoeff and kRGBCoeffBias.  U and V are
// used as is and the per channel biases bb, bg and br are applied separately.
#define LOAD_YUV_CONSTANTS                 \
  int yg = yuvconstants->kRGBCoeffBias[0]; \
  int bb = yuvconstants->kRGBCoeffBias[1]; \
  int bg = yuvconstants->kRGBCoeffBias[2]; \
  int br = yuvconstants->kRGBCoeffBias[3]; \
  int ub = yuvconstants->kUVCoeff[0];      \
  int vr = yuvconstants->kUVCoeff[1];      \
  int ug = yuvconstants->kUVCoeff[2];      \
  int vg = yuvconstants->kUVCoeff[3]

#define CALC_RGB16                         \
  int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
  int r16 = y1 + (v * vr) - br;            \
  int g16 = y1 + bg - (u * ug + v * vg);   \
  int b16 = y1 + (u * ub) - bb
#else
// Other targets: coefficients are read from kUVToB, kUVToG, kUVToR, kYToRgb
// and kYBiasToRgb.  U and V are re-centred around zero as signed 8 bit values
// (subtract 0x80) and the luma bias yb is folded into y1.
#define LOAD_YUV_CONSTANTS               \
  int yg = yuvconstants->kYToRgb[0];     \
  int yb = yuvconstants->kYBiasToRgb[0]; \
  int ub = yuvconstants->kUVToB[0];      \
  int ug = yuvconstants->kUVToG[0];      \
  int vg = yuvconstants->kUVToG[1];      \
  int vr = yuvconstants->kUVToR[1]

#define CALC_RGB16                                \
  int8_t ui = (int8_t)u;                          \
  int8_t vi = (int8_t)v;                          \
  int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
  ui -= 0x80;                                     \
  vi -= 0x80;                                     \
  int r16 = y1 + (vi * vr);                       \
  int g16 = y1 - (ui * ug + vi * vg);             \
  int b16 = y1 + (ui * ub)
#endif
1665 
1666 // C reference code that mimics the YUV assembly.
1667 // Reads 8 bit YUV and leaves result as 16 bit.
1668 static __inline void YuvPixel(uint8_t y,
1669                               uint8_t u,
1670                               uint8_t v,
1671                               uint8_t* b,
1672                               uint8_t* g,
1673                               uint8_t* r,
1674                               const struct YuvConstants* yuvconstants) {
1675   LOAD_YUV_CONSTANTS;
1676   uint32_t y32 = y * 0x0101;
1677   CALC_RGB16;
1678   *b = Clamp((int32_t)(b16) >> 6);
1679   *g = Clamp((int32_t)(g16) >> 6);
1680   *r = Clamp((int32_t)(r16) >> 6);
1681 }
1682 
1683 // Reads 8 bit YUV and leaves result as 16 bit.
YuvPixel8_16(uint8_t y,uint8_t u,uint8_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1684 static __inline void YuvPixel8_16(uint8_t y,
1685                                   uint8_t u,
1686                                   uint8_t v,
1687                                   int* b,
1688                                   int* g,
1689                                   int* r,
1690                                   const struct YuvConstants* yuvconstants) {
1691   LOAD_YUV_CONSTANTS;
1692   uint32_t y32 = y * 0x0101;
1693   CALC_RGB16;
1694   *b = b16;
1695   *g = g16;
1696   *r = r16;
1697 }
1698 
1699 // C reference code that mimics the YUV 16 bit assembly.
1700 // Reads 10 bit YUV and leaves result as 16 bit.
YuvPixel10_16(uint16_t y,uint16_t u,uint16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1701 static __inline void YuvPixel10_16(uint16_t y,
1702                                    uint16_t u,
1703                                    uint16_t v,
1704                                    int* b,
1705                                    int* g,
1706                                    int* r,
1707                                    const struct YuvConstants* yuvconstants) {
1708   LOAD_YUV_CONSTANTS;
1709   uint32_t y32 = y << 6;
1710   u = clamp255(u >> 2);
1711   v = clamp255(v >> 2);
1712   CALC_RGB16;
1713   *b = b16;
1714   *g = g16;
1715   *r = r16;
1716 }
1717 
1718 // C reference code that mimics the YUV 16 bit assembly.
1719 // Reads 12 bit YUV and leaves result as 16 bit.
YuvPixel12_16(int16_t y,int16_t u,int16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1720 static __inline void YuvPixel12_16(int16_t y,
1721                                    int16_t u,
1722                                    int16_t v,
1723                                    int* b,
1724                                    int* g,
1725                                    int* r,
1726                                    const struct YuvConstants* yuvconstants) {
1727   LOAD_YUV_CONSTANTS;
1728   uint32_t y32 = y << 4;
1729   u = clamp255(u >> 4);
1730   v = clamp255(v >> 4);
1731   CALC_RGB16;
1732   *b = b16;
1733   *g = g16;
1734   *r = r16;
1735 }
1736 
// C reference code that mimics the YUV 10 bit assembly.
// Converts one 10 bit Y, U, V triple and stores clamped 8 bit B, G and R.
static __inline void YuvPixel10(uint16_t y,
                                uint16_t u,
                                uint16_t v,
                                uint8_t* b,
                                uint8_t* g,
                                uint8_t* r,
                                const struct YuvConstants* yuvconstants) {
  int blue;
  int green;
  int red;
  YuvPixel10_16(y, u, v, &blue, &green, &red, yuvconstants);
  // Drop the 6 fractional bits, then clamp into the 8 bit outputs.
  *b = Clamp(blue >> 6);
  *g = Clamp(green >> 6);
  *r = Clamp(red >> 6);
}
1754 
// C reference code that mimics the YUV 12 bit assembly.
// Converts one 12 bit Y, U, V triple and stores clamped 8 bit B, G and R.
static __inline void YuvPixel12(uint16_t y,
                                uint16_t u,
                                uint16_t v,
                                uint8_t* b,
                                uint8_t* g,
                                uint8_t* r,
                                const struct YuvConstants* yuvconstants) {
  int blue;
  int green;
  int red;
  YuvPixel12_16(y, u, v, &blue, &green, &red, yuvconstants);
  // Drop the 6 fractional bits, then clamp into the 8 bit outputs.
  *b = Clamp(blue >> 6);
  *g = Clamp(green >> 6);
  *r = Clamp(red >> 6);
}
1772 
1773 // C reference code that mimics the YUV 16 bit assembly.
1774 // Reads 16 bit YUV and leaves result as 8 bit.
YuvPixel16_8(uint16_t y,uint16_t u,uint16_t v,uint8_t * b,uint8_t * g,uint8_t * r,const struct YuvConstants * yuvconstants)1775 static __inline void YuvPixel16_8(uint16_t y,
1776                                   uint16_t u,
1777                                   uint16_t v,
1778                                   uint8_t* b,
1779                                   uint8_t* g,
1780                                   uint8_t* r,
1781                                   const struct YuvConstants* yuvconstants) {
1782   LOAD_YUV_CONSTANTS;
1783   uint32_t y32 = y;
1784   u = clamp255(u >> 8);
1785   v = clamp255(v >> 8);
1786   CALC_RGB16;
1787   *b = Clamp((int32_t)(b16) >> 6);
1788   *g = Clamp((int32_t)(g16) >> 6);
1789   *r = Clamp((int32_t)(r16) >> 6);
1790 }
1791 
1792 // C reference code that mimics the YUV 16 bit assembly.
1793 // Reads 16 bit YUV and leaves result as 16 bit.
YuvPixel16_16(uint16_t y,uint16_t u,uint16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1794 static __inline void YuvPixel16_16(uint16_t y,
1795                                    uint16_t u,
1796                                    uint16_t v,
1797                                    int* b,
1798                                    int* g,
1799                                    int* r,
1800                                    const struct YuvConstants* yuvconstants) {
1801   LOAD_YUV_CONSTANTS;
1802   uint32_t y32 = y;
1803   u = clamp255(u >> 8);
1804   v = clamp255(v >> 8);
1805   CALC_RGB16;
1806   *b = b16;
1807   *g = g16;
1808   *r = r16;
1809 }
1810 
1811 // C reference code that mimics the YUV assembly.
1812 // Reads 8 bit YUV and leaves result as 8 bit.
YPixel(uint8_t y,uint8_t * b,uint8_t * g,uint8_t * r,const struct YuvConstants * yuvconstants)1813 static __inline void YPixel(uint8_t y,
1814                             uint8_t* b,
1815                             uint8_t* g,
1816                             uint8_t* r,
1817                             const struct YuvConstants* yuvconstants) {
1818 #if defined(__aarch64__) || defined(__arm__)
1819   int yg = yuvconstants->kRGBCoeffBias[0];
1820   int ygb = yuvconstants->kRGBCoeffBias[4];
1821 #else
1822   int ygb = yuvconstants->kYBiasToRgb[0];
1823   int yg = yuvconstants->kYToRgb[0];
1824 #endif
1825   uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
1826   *b = Clamp(((int32_t)(y1) + ygb) >> 6);
1827   *g = Clamp(((int32_t)(y1) + ygb) >> 6);
1828   *r = Clamp(((int32_t)(y1) + ygb) >> 6);
1829 }
1830 
// Converts a row of 8 bit YUV with one U and V sample per pixel to 4 byte
// pixels: B, G, R from YuvPixel followed by an alpha byte of 255.
void I444ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = 255;  // Opaque alpha.
    ++src_y;
    ++src_u;
    ++src_v;
    rgb_buf += 4;  // Next output pixel.
  }
}
1848 
// Converts a row of 8 bit YUV where each U and V sample is shared by two
// horizontally adjacent pixels to 4 byte pixels (B, G, R, alpha = 255).
// Also used for 420.
void I422ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
             yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = 255;
  }
}
1875 
// 10 bit YUV to ARGB.
// Each U and V sample is shared by two horizontally adjacent pixels; every
// output pixel is 4 bytes (B, G, R, alpha = 255).
void I210ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel10(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel10(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel10(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
               yuvconstants);
    rgb_buf[3] = 255;
  }
}
1902 
// Converts a row of 10 bit YUV with one U and V sample per pixel to 4 byte
// pixels (B, G, R, alpha = 255).
void I410ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
               yuvconstants);
    rgb_buf[3] = 255;  // Opaque alpha.
    ++src_y;
    ++src_u;
    ++src_v;
    rgb_buf += 4;  // Next output pixel.
  }
}
1920 
// Converts a row of 10 bit YUV plus a 10 bit alpha plane to 4 byte pixels.
// Each U and V sample is shared by two horizontally adjacent pixels; alpha is
// reduced to 8 bits with a right shift by 2 and clamp255.
void I210AlphaToARGBRow_C(const uint16_t* src_y,
                          const uint16_t* src_u,
                          const uint16_t* src_v,
                          const uint16_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel10(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = clamp255(src_a[0] >> 2);
    YuvPixel10(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = clamp255(src_a[1] >> 2);
    src_y += 2;
    ++src_u;
    ++src_v;
    src_a += 2;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel10(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
               yuvconstants);
    rgb_buf[3] = clamp255(*src_a >> 2);
  }
}
1948 
// Converts a row of 10 bit YUV plus a 10 bit alpha plane to 4 byte pixels,
// with one U and V sample per pixel.  Alpha is reduced to 8 bits with a
// right shift by 2 and clamp255.
void I410AlphaToARGBRow_C(const uint16_t* src_y,
                          const uint16_t* src_u,
                          const uint16_t* src_v,
                          const uint16_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
               yuvconstants);
    rgb_buf[3] = clamp255(*src_a >> 2);
    ++src_y;
    ++src_u;
    ++src_v;
    ++src_a;
    rgb_buf += 4;  // Next output pixel.
  }
}
1968 
// 12 bit YUV to ARGB.
// Each U and V sample is shared by two horizontally adjacent pixels; every
// output pixel is 4 bytes (B, G, R, alpha = 255).
void I212ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel12(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel12(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel12(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
               yuvconstants);
    rgb_buf[3] = 255;
  }
}
1995 
// Packs one pixel into a 32 bit AR30 word and stores it at rgb_buf.
//
// b, g and r are the unshifted values produced by the YuvPixel*_16 helpers.
// Shifting right by 4 (instead of the 6 used for 8 bit output) keeps two more
// bits per channel, and Clamp10 then limits each value for its 10 bit field.
// Layout: B in bits 0..9, G in bits 10..19, R in bits 20..29; the two top
// bits are always set (0xc0000000).
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
  uint32_t ar30;
  b = Clamp10(b >> 4);
  g = Clamp10(g >> 4);
  r = Clamp10(r >> 4);
  ar30 = (uint32_t)b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000u;
  // rgb_buf is a byte pointer with no alignment guarantee.  memcpy performs
  // the same native-endian 4 byte store without a misaligned or
  // type-punned access.
  memcpy(rgb_buf, &ar30, sizeof(ar30));
}
2007 
// 10 bit YUV to 10 bit AR30.
// Each U and V sample is shared by two horizontally adjacent pixels; every
// output pixel is one 4 byte AR30 word written by StoreAR30.
void I210ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel10_16(src_y[0], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    YuvPixel10_16(src_y[1], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf + 4, blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel10_16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2034 
// 12 bit YUV to 10 bit AR30.
// Each U and V sample is shared by two horizontally adjacent pixels; every
// output pixel is one 4 byte AR30 word written by StoreAR30.
void I212ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel12_16(src_y[0], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    YuvPixel12_16(src_y[1], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf + 4, blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel12_16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2061 
// Converts a row of 10 bit YUV with one U and V sample per pixel to AR30,
// one 4 byte word per pixel written by StoreAR30.
void I410ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10_16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    ++src_y;
    ++src_u;
    ++src_v;
    rgb_buf += 4;  // Next output pixel.
  }
}
2081 
// P210 has 10 bits in the msb of each 16 bit sample, in an NV12 style layout:
// a Y plane plus one plane of interleaved U, V pairs.  Each U, V pair is
// shared by two horizontally adjacent pixels; output is 4 bytes per pixel
// (B, G, R, alpha = 255).
void P210ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Two pixels per iteration, both using the same U, V pair.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;
    YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], &dst_argb[4], &dst_argb[5],
                 &dst_argb[6], yuvconstants);
    dst_argb[7] = 255;
    src_y += 2;
    src_uv += 2;    // One interleaved U, V pair.
    dst_argb += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;
  }
}
2106 
// Converts a row of 16 bit Y samples plus interleaved 16 bit U, V pairs (one
// pair per pixel) to 4 byte pixels (B, G, R, alpha = 255).
void P410ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel16_8(*src_y, src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;  // Opaque alpha.
    ++src_y;
    src_uv += 2;    // One interleaved U, V pair.
    dst_argb += 4;  // Next output pixel.
  }
}
2122 
// Converts a row of 16 bit Y samples plus interleaved 16 bit U, V pairs to
// AR30.  Each U, V pair is shared by two horizontally adjacent pixels; every
// output pixel is one 4 byte word written by StoreAR30.
void P210ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_ar30,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Two pixels per iteration, both using the same U, V pair.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
    YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30 + 4, blue, green, red);
    src_y += 2;
    src_uv += 2;    // One interleaved U, V pair.
    dst_ar30 += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
  }
}
2146 
// Converts a row of 16 bit Y samples plus interleaved 16 bit U, V pairs (one
// pair per pixel) to AR30, one 4 byte word per pixel written by StoreAR30.
void P410ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_ar30,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel16_16(*src_y, src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
    ++src_y;
    src_uv += 2;    // One interleaved U, V pair.
    dst_ar30 += 4;  // Next output pixel.
  }
}
2164 
// 8 bit YUV to 10 bit AR30.
// Uses YuvPixel8_16 to get unshifted B, G and R values and the same
// StoreAR30 packing as the 10 bit paths.  Each U and V sample is shared by
// two horizontally adjacent pixels.
void I422ToAR30Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel8_16(src_y[0], *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    YuvPixel8_16(src_y[1], *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf + 4, blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel8_16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2192 
// Converts a row of 8 bit YUV (one U and V sample per pixel) plus an 8 bit
// alpha plane to 4 byte pixels: B, G, R from YuvPixel, then the alpha byte
// copied from src_a.
void I444AlphaToARGBRow_C(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          const uint8_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = *src_a;
    ++src_y;
    ++src_u;
    ++src_v;
    ++src_a;
    rgb_buf += 4;  // Next output pixel.
  }
}
2212 
// Converts a row of 8 bit YUV plus an 8 bit alpha plane to 4 byte pixels.
// Each U and V sample is shared by two horizontally adjacent pixels; alpha
// has one sample per pixel and is copied unchanged.
void I422AlphaToARGBRow_C(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          const uint8_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = src_a[0];
    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
             yuvconstants);
    rgb_buf[7] = src_a[1];
    src_y += 2;
    ++src_u;
    ++src_v;
    src_a += 2;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = *src_a;
  }
}
2240 
// Converts a row of 8 bit YUV to 3 byte pixels (B, G, R with no alpha byte).
// Each U and V sample is shared by two horizontally adjacent pixels.
void I422ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int remaining;
  // Two pixels per iteration, both using the same chroma sample.
  for (remaining = width; remaining >= 2; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[3], &rgb_buf[4], &rgb_buf[5],
             yuvconstants);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 6;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
  }
}
2263 
// Converts a row of 8 bit YUV to 16 bit ARGB4444 pixels.
// Each U and V sample is shared by two horizontally adjacent pixels.  Every
// channel keeps its top 4 bits: B in bits 0..3, G in bits 4..7, R in bits
// 8..11, and bits 12..15 are always set (0xf000).
//
// Pixels are stored with memcpy because dst_argb4444 is a byte pointer with
// no alignment guarantee; this performs the same native-endian 2 byte store
// without a misaligned or type-punned access.
void I422ToARGB4444Row_C(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb4444,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    // Convert both pixels before storing either one.
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    pixel = (uint16_t)((b0 >> 4) | ((g0 >> 4) << 4) | ((r0 >> 4) << 8) |
                       0xf000);
    memcpy(dst_argb4444 + 0, &pixel, sizeof(pixel));
    pixel = (uint16_t)((b1 >> 4) | ((g1 >> 4) << 4) | ((r1 >> 4) << 8) |
                       0xf000);
    memcpy(dst_argb4444 + 2, &pixel, sizeof(pixel));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_argb4444 += 4;  // Advance 2 pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    pixel = (uint16_t)((b0 >> 4) | ((g0 >> 4) << 4) | ((r0 >> 4) << 8) |
                       0xf000);
    memcpy(dst_argb4444, &pixel, sizeof(pixel));
  }
}
2301 
// Converts a row of 8 bit YUV to 16 bit ARGB1555 pixels.
// Each U and V sample is shared by two horizontally adjacent pixels.  Every
// channel keeps its top 5 bits: B in bits 0..4, G in bits 5..9, R in bits
// 10..14, and bit 15 is always set (0x8000).
//
// Pixels are stored with memcpy because dst_argb1555 is a byte pointer with
// no alignment guarantee; this performs the same native-endian 2 byte store
// without a misaligned or type-punned access.
void I422ToARGB1555Row_C(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb1555,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    // Convert both pixels before storing either one.
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    pixel = (uint16_t)((b0 >> 3) | ((g0 >> 3) << 5) | ((r0 >> 3) << 10) |
                       0x8000);
    memcpy(dst_argb1555 + 0, &pixel, sizeof(pixel));
    pixel = (uint16_t)((b1 >> 3) | ((g1 >> 3) << 5) | ((r1 >> 3) << 10) |
                       0x8000);
    memcpy(dst_argb1555 + 2, &pixel, sizeof(pixel));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_argb1555 += 4;  // Advance 2 pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    pixel = (uint16_t)((b0 >> 3) | ((g0 >> 3) << 5) | ((r0 >> 3) << 10) |
                       0x8000);
    memcpy(dst_argb1555, &pixel, sizeof(pixel));
  }
}
2339 
// Convert a row of I422 to RGB565.
// Pixels are handled in pairs: two Y samples share one U and one V sample.
// Each output pixel is one native-endian 16 bit word built from the bytes
// returned by YuvPixel: top 5 bits of b in bits 0-4, top 6 bits of g in bits
// 5-10, top 5 bits of r in bits 11-15.
// The word is stored with memcpy so dst_rgb565 needs no 2 byte alignment.
void I422ToRGB565Row_C(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_rgb565,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
    b1 = b1 >> 3;
    g1 = g1 >> 2;
    r1 = r1 >> 3;
    // Explicit narrowing (for ubsan): the packed value fits in 16 bits.
    pixel = (uint16_t)(b0 | (g0 << 5) | (r0 << 11));
    memcpy(dst_rgb565 + 0, &pixel, sizeof(pixel));
    pixel = (uint16_t)(b1 | (g1 << 5) | (r1 << 11));
    memcpy(dst_rgb565 + 2, &pixel, sizeof(pixel));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
    pixel = (uint16_t)(b0 | (g0 << 5) | (r0 << 11));
    memcpy(dst_rgb565, &pixel, sizeof(pixel));
  }
}
2377 
// Convert a row of NV12 (Y samples plus interleaved U,V byte pairs) into
// 4 byte pixels. YuvPixel fills bytes 0..2 of each pixel and byte 3 is set
// to 255. One U,V pair is shared by two horizontally adjacent pixels.
void NV12ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_uv,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 4;
    YuvPixel(src_y[0], src_uv[0], src_uv[1], px0, px0 + 1, px0 + 2,
             yuvconstants);
    px0[3] = 255;
    YuvPixel(src_y[1], src_uv[0], src_uv[1], px1, px1 + 1, px1 + 2,
             yuvconstants);
    px1[3] = 255;
    src_y += 2;
    src_uv += 2;   // Next U,V pair.
    rgb_buf += 8;  // Two 4 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf, rgb_buf + 1,
             rgb_buf + 2, yuvconstants);
    rgb_buf[3] = 255;
  }
}
2401 
// Convert a row of NV21 (Y samples plus interleaved V,U byte pairs) into
// 4 byte pixels. Same as the NV12 variant except the chroma bytes are
// swapped: src_vu[1] is passed as U and src_vu[0] as V. YuvPixel fills bytes
// 0..2 of each pixel and byte 3 is set to 255.
void NV21ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_vu,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 4;
    YuvPixel(src_y[0], src_vu[1], src_vu[0], px0, px0 + 1, px0 + 2,
             yuvconstants);
    px0[3] = 255;
    YuvPixel(src_y[1], src_vu[1], src_vu[0], px1, px1 + 1, px1 + 2,
             yuvconstants);
    px1[3] = 255;
    src_y += 2;
    src_vu += 2;   // Next V,U pair.
    rgb_buf += 8;  // Two 4 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf, rgb_buf + 1,
             rgb_buf + 2, yuvconstants);
    rgb_buf[3] = 255;
  }
}
2425 
// Convert a row of NV12 (Y samples plus interleaved U,V byte pairs) into
// 3 byte pixels; YuvPixel fills all three bytes of each pixel. One U,V pair
// is shared by two horizontally adjacent pixels.
void NV12ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_uv,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 3;
    YuvPixel(src_y[0], src_uv[0], src_uv[1], px0, px0 + 1, px0 + 2,
             yuvconstants);
    YuvPixel(src_y[1], src_uv[0], src_uv[1], px1, px1 + 1, px1 + 2,
             yuvconstants);
    src_y += 2;
    src_uv += 2;   // Next U,V pair.
    rgb_buf += 6;  // Two 3 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf, rgb_buf + 1,
             rgb_buf + 2, yuvconstants);
  }
}
2446 
// Convert a row of NV21 (Y samples plus interleaved V,U byte pairs) into
// 3 byte pixels. Same as the NV12 variant except the chroma bytes are
// swapped: src_vu[1] is passed as U and src_vu[0] as V.
void NV21ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_vu,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 3;
    YuvPixel(src_y[0], src_vu[1], src_vu[0], px0, px0 + 1, px0 + 2,
             yuvconstants);
    YuvPixel(src_y[1], src_vu[1], src_vu[0], px1, px1 + 1, px1 + 2,
             yuvconstants);
    src_y += 2;
    src_vu += 2;   // Next V,U pair.
    rgb_buf += 6;  // Two 3 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf, rgb_buf + 1,
             rgb_buf + 2, yuvconstants);
  }
}
2467 
// Convert a row of NV12 (Y samples plus interleaved U,V byte pairs) to
// RGB565. Two adjacent pixels share one U,V pair.
// Each output pixel is one native-endian 16 bit word built from the bytes
// returned by YuvPixel: top 5 bits of b in bits 0-4, top 6 bits of g in bits
// 5-10, top 5 bits of r in bits 11-15.
// The word is stored with memcpy so dst_rgb565 needs no 2 byte alignment.
void NV12ToRGB565Row_C(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* dst_rgb565,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
    b1 = b1 >> 3;
    g1 = g1 >> 2;
    r1 = r1 >> 3;
    // Explicit narrowing: the packed value always fits in 16 bits.
    pixel = (uint16_t)(b0 | (g0 << 5) | (r0 << 11));
    memcpy(dst_rgb565 + 0, &pixel, sizeof(pixel));
    pixel = (uint16_t)(b1 | (g1 << 5) | (r1 << 11));
    memcpy(dst_rgb565 + 2, &pixel, sizeof(pixel));
    src_y += 2;
    src_uv += 2;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
    pixel = (uint16_t)(b0 | (g0 << 5) | (r0 << 11));
    memcpy(dst_rgb565, &pixel, sizeof(pixel));
  }
}
2503 
// Convert a row of packed YUY2 into 4 byte pixels. Each 4 byte source group
// is read as Y0 (byte 0), U (byte 1), Y1 (byte 2), V (byte 3) and yields two
// pixels sharing U and V. YuvPixel fills bytes 0..2 of each output pixel and
// byte 3 is set to 255.
void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 4;
    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], px0, px0 + 1, px0 + 2,
             yuvconstants);
    px0[3] = 255;
    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], px1, px1 + 1, px1 + 2,
             yuvconstants);
    px1[3] = 255;
    src_yuy2 += 4;  // One Y0 U Y1 V group.
    rgb_buf += 8;   // Two 4 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: the trailing pixel still reads U and V from bytes 1 and 3.
    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf, rgb_buf + 1,
             rgb_buf + 2, yuvconstants);
    rgb_buf[3] = 255;
  }
}
2525 
// Convert a row of packed UYVY into 4 byte pixels. Each 4 byte source group
// is read as U (byte 0), Y0 (byte 1), V (byte 2), Y1 (byte 3) and yields two
// pixels sharing U and V. YuvPixel fills bytes 0..2 of each output pixel and
// byte 3 is set to 255.
void UYVYToARGBRow_C(const uint8_t* src_uyvy,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 4;
    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], px0, px0 + 1, px0 + 2,
             yuvconstants);
    px0[3] = 255;
    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], px1, px1 + 1, px1 + 2,
             yuvconstants);
    px1[3] = 255;
    src_uyvy += 4;  // One U Y0 V Y1 group.
    rgb_buf += 8;   // Two 4 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: the trailing pixel still reads U and V from bytes 0 and 2.
    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf, rgb_buf + 1,
             rgb_buf + 2, yuvconstants);
    rgb_buf[3] = 255;
  }
}
2547 
// Convert a row of I422 into 4 byte pixels with the constant byte first:
// YuvPixel fills bytes 1..3 of each pixel and byte 0 is set to 255.
// Two adjacent pixels share one U and one V sample.
void I422ToRGBARow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 4;
    YuvPixel(src_y[0], src_u[0], src_v[0], px0 + 1, px0 + 2, px0 + 3,
             yuvconstants);
    px0[0] = 255;
    YuvPixel(src_y[1], src_u[0], src_v[0], px1 + 1, px1 + 2, px1 + 3,
             yuvconstants);
    px1[0] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two 4 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
             rgb_buf + 3, yuvconstants);
    rgb_buf[0] = 255;
  }
}
2573 
// Convert a row of Y-only samples into 4 byte pixels. YPixel fills bytes
// 0..2 of each pixel from a single Y sample and byte 3 is set to 255.
void I400ToARGBRow_C(const uint8_t* src_y,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;
  while (remaining >= 2) {
    uint8_t* const px0 = rgb_buf;
    uint8_t* const px1 = rgb_buf + 4;
    YPixel(src_y[0], px0, px0 + 1, px0 + 2, yuvconstants);
    px0[3] = 255;
    YPixel(src_y[1], px1, px1 + 1, px1 + 2, yuvconstants);
    px1[3] = 255;
    src_y += 2;
    rgb_buf += 8;  // Two 4 byte pixels.
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YPixel(src_y[0], rgb_buf, rgb_buf + 1, rgb_buf + 2, yuvconstants);
    rgb_buf[3] = 255;
  }
}
2592 
// Copy width bytes from src to dst in reverse order.
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  const uint8_t* in = src + (width - 1);  // Last source byte.
  uint8_t* out = dst;
  int remaining = width;
  while (remaining >= 2) {
    out[0] = in[0];
    out[1] = in[-1];
    in -= 2;
    out += 2;
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: the first source byte becomes the last output byte.
    dst[width - 1] = in[0];
  }
}
2605 
// Reverse the order of width 2-byte pairs; the two bytes inside each pair
// keep their order.
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  const uint8_t* in = src_uv + ((width - 1) << 1);  // Last source pair.
  uint8_t* out = dst_uv;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    out[0] = in[0];
    out[1] = in[1];
    in -= 2;
    out += 2;
  }
}
2616 
// Walk width interleaved 2-byte pairs from last to first, writing the first
// byte of each pair to dst_u and the second to dst_v.
void MirrorSplitUVRow_C(const uint8_t* src_uv,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  const uint8_t* in = src_uv + ((width - 1) << 1);  // Last source pair.
  uint8_t* out_u = dst_u;
  uint8_t* out_v = dst_v;
  int remaining = width;
  while (remaining >= 2) {
    out_u[0] = in[0];
    out_u[1] = in[-2];
    out_v[0] = in[1];
    out_v[1] = in[-1];
    in -= 4;
    out_u += 2;
    out_v += 2;
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: the first source pair lands in the last output slot.
    dst_u[width - 1] = in[0];
    dst_v[width - 1] = in[1];
  }
}
2635 
// Reverse the order of width 4-byte pixels; the bytes inside each pixel keep
// their order. Pixels are moved with memcpy through a temporary, so neither
// src nor dst needs to be 4 byte aligned.
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t pixel;
    // Source pixels are taken from the end of the row backwards.
    memcpy(&pixel, src + (size_t)(width - 1 - x) * sizeof(pixel),
           sizeof(pixel));
    memcpy(dst + (size_t)x * sizeof(pixel), &pixel, sizeof(pixel));
  }
}
2650 
// Reverse the order of width 3-byte pixels; the bytes inside each pixel keep
// their order.
void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
  const uint8_t* in = src_rgb24 + (width * 3 - 3);  // Last source pixel.
  uint8_t* out = dst_rgb24;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Read the whole pixel before storing any byte of it.
    const uint8_t c0 = in[0];
    const uint8_t c1 = in[1];
    const uint8_t c2 = in[2];
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
    in -= 3;
    out += 3;
  }
}
2665 
// De-interleave width 2-byte pairs: the first byte of each pair goes to
// dst_u and the second byte to dst_v.
void SplitUVRow_C(const uint8_t* src_uv,
                  uint8_t* dst_u,
                  uint8_t* dst_v,
                  int width) {
  uint8_t* out_u = dst_u;
  uint8_t* out_v = dst_v;
  int remaining = width;
  while (remaining >= 2) {
    out_u[0] = src_uv[0];
    out_u[1] = src_uv[2];
    out_v[0] = src_uv[1];
    out_v[1] = src_uv[3];
    src_uv += 4;
    out_u += 2;
    out_v += 2;
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pair.
    dst_u[width - 1] = src_uv[0];
    dst_v[width - 1] = src_uv[1];
  }
}
2683 
// Interleave width bytes of src_u and src_v into 2-byte pairs in dst_uv,
// with the src_u byte first in each pair.
void MergeUVRow_C(const uint8_t* src_u,
                  const uint8_t* src_v,
                  uint8_t* dst_uv,
                  int width) {
  const uint8_t* in_u = src_u;
  const uint8_t* in_v = src_v;
  int remaining = width;
  while (remaining >= 2) {
    dst_uv[0] = in_u[0];
    dst_uv[1] = in_v[0];
    dst_uv[2] = in_u[1];
    dst_uv[3] = in_v[1];
    in_u += 2;
    in_v += 2;
    dst_uv += 4;
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: one trailing pair.
    dst_uv[0] = src_u[width - 1];
    dst_uv[1] = src_v[width - 1];
  }
}
2701 
// Gather one output row from tiled input: copy 16 bytes at a time, stepping
// the source by src_tile_stride bytes between copies, then copy the final
// (width & 15) bytes from the next source position.
void DetileRow_C(const uint8_t* src,
                 ptrdiff_t src_tile_stride,
                 uint8_t* dst,
                 int width) {
  const int leftover = width & 15;
  int remaining = width;
  while (remaining >= 16) {
    memcpy(dst, src, 16);
    src += src_tile_stride;
    dst += 16;
    remaining -= 16;
  }
  if (leftover) {
    memcpy(dst, src, leftover);
  }
}
2716 
// Gather and de-interleave one row from tiled UV input. Every 16 source
// bytes are split by SplitUVRow_C into 8 bytes for dst_u and 8 for dst_v,
// with the source stepping by src_tile_stride bytes between groups. The
// final (width & 15) bytes are split as ((width & 15) + 1) / 2 pairs.
void DetileSplitUVRow_C(const uint8_t* src_uv,
                        ptrdiff_t src_tile_stride,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  const int leftover = width & 15;
  int remaining = width;
  while (remaining >= 16) {
    SplitUVRow_C(src_uv, dst_u, dst_v, 8);
    src_uv += src_tile_stride;
    dst_u += 8;
    dst_v += 8;
    remaining -= 16;
  }
  if (leftover) {
    SplitUVRow_C(src_uv, dst_u, dst_v, (leftover + 1) / 2);
  }
}
2733 
// Split width 3-byte pixels into three planes: byte 0 of each pixel goes to
// dst_r, byte 1 to dst_g and byte 2 to dst_b.
void SplitRGBRow_C(const uint8_t* src_rgb,
                   uint8_t* dst_r,
                   uint8_t* dst_g,
                   uint8_t* dst_b,
                   int width) {
  const uint8_t* in = src_rgb;
  int i;
  for (i = 0; i < width; ++i, in += 3) {
    dst_r[i] = in[0];
    dst_g[i] = in[1];
    dst_b[i] = in[2];
  }
}
2747 
// Interleave three planes into width 3-byte pixels: src_r supplies byte 0,
// src_g byte 1 and src_b byte 2 of each pixel.
void MergeRGBRow_C(const uint8_t* src_r,
                   const uint8_t* src_g,
                   const uint8_t* src_b,
                   uint8_t* dst_rgb,
                   int width) {
  uint8_t* out = dst_rgb;
  int i;
  for (i = 0; i < width; ++i, out += 3) {
    out[0] = src_r[i];
    out[1] = src_g[i];
    out[2] = src_b[i];
  }
}
2761 
// Split width 4-byte pixels into four planes: byte 0 of each pixel goes to
// dst_b, byte 1 to dst_g, byte 2 to dst_r and byte 3 to dst_a.
void SplitARGBRow_C(const uint8_t* src_argb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
                    uint8_t* dst_b,
                    uint8_t* dst_a,
                    int width) {
  const uint8_t* in = src_argb;
  int i;
  for (i = 0; i < width; ++i, in += 4) {
    dst_b[i] = in[0];
    dst_g[i] = in[1];
    dst_r[i] = in[2];
    dst_a[i] = in[3];
  }
}
2777 
// Interleave four planes into width 4-byte pixels: src_b supplies byte 0,
// src_g byte 1, src_r byte 2 and src_a byte 3 of each pixel.
void MergeARGBRow_C(const uint8_t* src_r,
                    const uint8_t* src_g,
                    const uint8_t* src_b,
                    const uint8_t* src_a,
                    uint8_t* dst_argb,
                    int width) {
  uint8_t* out = dst_argb;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    out[0] = src_b[i];
    out[1] = src_g[i];
    out[2] = src_r[i];
    out[3] = src_a[i];
  }
}
2793 
// Merge three 16 bit planes into one native-endian 32 bit word per pixel.
// Each sample is shifted right by (depth - 10) and passed through clamp1023,
// then packed as b in bits 0-9, g in bits 10-19, r in bits 20-29 with the
// top two bits set (0xc0000000).
// depth must be in 10..16 (asserted).
// The word is stored with memcpy so dst_ar30 needs no 4 byte alignment.
void MergeXR30Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint8_t* dst_ar30,
                    int depth,
                    int width) {
  assert(depth >= 10);
  assert(depth <= 16);
  int x;
  int shift = depth - 10;
  for (x = 0; x < width; ++x) {
    uint32_t r = clamp1023(src_r[x] >> shift);
    uint32_t g = clamp1023(src_g[x] >> shift);
    uint32_t b = clamp1023(src_b[x] >> shift);
    uint32_t ar30 = b | (g << 10) | (r << 20) | 0xc0000000;
    memcpy(dst_ar30 + (size_t)x * sizeof(ar30), &ar30, sizeof(ar30));
  }
}
2812 
// Merge four 16 bit planes into 4 x 16 bit pixels stored in b, g, r, a
// order. Each sample goes through ClampMax with a limit of (1 << depth) - 1
// and is then shifted left by (16 - depth).
// depth must be in 1..16 (asserted).
void MergeAR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    const uint16_t* src_a,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  const int up_shift = 16 - depth;
  const int limit = (1 << depth) - 1;
  uint16_t* out = dst_ar64;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    out[0] = ClampMax(src_b[i], limit) << up_shift;
    out[1] = ClampMax(src_g[i], limit) << up_shift;
    out[2] = ClampMax(src_r[i], limit) << up_shift;
    out[3] = ClampMax(src_a[i], limit) << up_shift;
  }
}
2833 
// Merge four 16 bit planes into 4-byte pixels stored in b, g, r, a order.
// Each sample is shifted right by (depth - 8) and passed through clamp255.
// depth must be in 8..16 (asserted).
void MergeARGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         const uint16_t* src_a,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  const int down_shift = depth - 8;
  uint8_t* out = dst_argb;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    out[0] = clamp255(src_b[i] >> down_shift);
    out[1] = clamp255(src_g[i] >> down_shift);
    out[2] = clamp255(src_r[i] >> down_shift);
    out[3] = clamp255(src_a[i] >> down_shift);
  }
}
2853 
// Merge three 16 bit planes into 4 x 16 bit pixels stored in b, g, r order
// with the fourth word fixed at 0xffff. Each sample goes through ClampMax
// with a limit of (1 << depth) - 1 and is then shifted left by (16 - depth).
// depth must be in 1..16 (asserted).
void MergeXR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  const int up_shift = 16 - depth;
  const int limit = (1 << depth) - 1;
  uint16_t* out = dst_ar64;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    out[0] = ClampMax(src_b[i], limit) << up_shift;
    out[1] = ClampMax(src_g[i], limit) << up_shift;
    out[2] = ClampMax(src_r[i], limit) << up_shift;
    out[3] = 0xffff;
  }
}
2873 
// Merge three 16 bit planes into 4-byte pixels stored in b, g, r order with
// the fourth byte fixed at 0xff. Each sample is shifted right by (depth - 8)
// and passed through clamp255.
// depth must be in 8..16 (asserted).
void MergeXRGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  const int down_shift = depth - 8;
  uint8_t* out = dst_argb;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    out[0] = clamp255(src_b[i] >> down_shift);
    out[1] = clamp255(src_g[i] >> down_shift);
    out[2] = clamp255(src_r[i] >> down_shift);
    out[3] = 0xff;
  }
}
2892 
// Split width 4-byte pixels into three planes: byte 0 of each pixel goes to
// dst_b, byte 1 to dst_g and byte 2 to dst_r. Byte 3 is ignored.
void SplitXRGBRow_C(const uint8_t* src_argb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
                    uint8_t* dst_b,
                    int width) {
  const uint8_t* in = src_argb;
  int i;
  for (i = 0; i < width; ++i, in += 4) {
    dst_b[i] = in[0];
    dst_g[i] = in[1];
    dst_r[i] = in[2];
  }
}
2906 
// Interleave three planes into width 4-byte pixels: src_b supplies byte 0,
// src_g byte 1, src_r byte 2, and byte 3 is fixed at 255.
void MergeXRGBRow_C(const uint8_t* src_r,
                    const uint8_t* src_g,
                    const uint8_t* src_b,
                    uint8_t* dst_argb,
                    int width) {
  uint8_t* out = dst_argb;
  int i;
  for (i = 0; i < width; ++i, out += 4) {
    out[0] = src_b[i];
    out[1] = src_g[i];
    out[2] = src_r[i];
    out[3] = 255;
  }
}
2921 
// Interleave two 16 bit planes into U,V pairs while converting lsb-aligned
// samples to msb-aligned: every sample is shifted left by (16 - depth).
// depth must be in 8..16 (asserted).
void MergeUVRow_16_C(const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint16_t* dst_uv,
                     int depth,
                     int width) {
  const int up_shift = 16 - depth;
  assert(depth >= 8);
  assert(depth <= 16);
  uint16_t* out = dst_uv;
  int i;
  for (i = 0; i < width; ++i, out += 2) {
    out[0] = src_u[i] << up_shift;
    out[1] = src_v[i] << up_shift;
  }
}
2938 
// De-interleave 16 bit U,V pairs into two planes while converting
// msb-aligned samples to lsb-aligned: every sample is shifted right by
// (16 - depth). depth must be in 8..16 (asserted).
void SplitUVRow_16_C(const uint16_t* src_uv,
                     uint16_t* dst_u,
                     uint16_t* dst_v,
                     int depth,
                     int width) {
  const int down_shift = 16 - depth;
  const uint16_t* in = src_uv;
  int i;
  assert(depth >= 8);
  assert(depth <= 16);
  for (i = 0; i < width; ++i, in += 2) {
    dst_u[i] = in[0] >> down_shift;
    dst_v[i] = in[1] >> down_shift;
  }
}
2955 
// Multiply each 16 bit sample by scale and store the low 16 bits of the
// product. The multiply is done in unsigned 32 bit arithmetic so that large
// products wrap instead of overflowing a signed int.
void MultiplyRow_16_C(const uint16_t* src_y,
                      uint16_t* dst_y,
                      int scale,
                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)((uint32_t)src_y[x] * (uint32_t)scale);
  }
}
2965 
// Scale each 16 bit sample by a 16.16 fixed point factor: the sample is
// multiplied by scale and the product is shifted right by 16. The multiply
// is done in unsigned 32 bit arithmetic so that products above INT_MAX
// (e.g. 65535 * 65536) do not overflow a signed int.
void DivideRow_16_C(const uint16_t* src_y,
                    uint16_t* dst_y,
                    int scale,
                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * (uint32_t)scale) >> 16);
  }
}
2975 
// Use scale to convert 16 bit samples to 8 bit, depending on how many bits
// of the source are significant:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)

// Convert a row of 16 bit samples to 8 bit: each sample is multiplied by
// scale, shifted right by 16 and passed through clamp255.
// scale must be in 256..32768 (asserted).
void Convert16To8Row_C(const uint16_t* src_y,
                       uint8_t* dst_y,
                       int scale,
                       int width) {
  int i;
  assert(scale >= 256);
  assert(scale <= 32768);

  for (i = 0; i < width; ++i) {
    const uint16_t sample = src_y[i];
    dst_y[i] = C16TO8(sample, scale);
  }
}
2996 
// Use scale to convert 8 bit samples to a wider lsb-aligned format:
// 1024 = 10 bits
// The scale is first multiplied by 0x0101, which is equivalent to
// replicating the source byte into both halves of a 16 bit value; the
// product with the sample is then shifted right by 16.
void Convert8To16Row_C(const uint8_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width) {
  const int replicated_scale = scale * 0x0101;
  int i;
  for (i = 0; i < width; ++i) {
    dst_y[i] = (src_y[i] * replicated_scale) >> 16;
  }
}
3009 
// Copy count bytes from src to dst with memcpy.
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
  const size_t num_bytes = (size_t)count;
  memcpy(dst, src, num_bytes);
}
3013 
// Copy count 16 bit samples from src to dst with memcpy. The byte count is
// computed in size_t so that a large count cannot overflow int arithmetic.
void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
  memcpy(dst, src, (size_t)count * sizeof(*src));
}
3017 
// Fill width bytes of dst with the value v8 using memset.
void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
  const size_t num_bytes = (size_t)width;
  memset(dst, (int)v8, num_bytes);
}
3021 
// Fill width 4-byte pixels of dst_argb with the 32 bit value v32. Each pixel
// is written with memcpy, so the bytes land in the host's native byte order.
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
  uint8_t* out = dst_argb;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    memcpy(out, &v32, sizeof(v32));
    out += sizeof(v32);
  }
}
3028 
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
// For every 4 byte group, U (byte 1) and V (byte 3) are each averaged, with
// rounding, against the byte src_stride_yuy2 further on. One U and one V are
// produced per 2 pixels of width.
void YUY2ToUVRow_C(const uint8_t* src_yuy2,
                   int src_stride_yuy2,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width) {
  int remaining;
  for (remaining = width; remaining > 0; remaining -= 2) {
    const int u_sum = src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1];
    const int v_sum = src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3];
    *dst_u++ = (u_sum + 1) >> 1;
    *dst_v++ = (v_sum + 1) >> 1;
    src_yuy2 += 4;
  }
}
3045 
// Copy row of YUY2 UV's (422) into U and V (422).
// For every 4 byte group, byte 1 is written to dst_u and byte 3 to dst_v;
// one U and one V are produced per 2 pixels of width.
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int remaining;
  for (remaining = width; remaining > 0; remaining -= 2) {
    *dst_u = src_yuy2[1];
    *dst_v = src_yuy2[3];
    ++dst_u;
    ++dst_v;
    src_yuy2 += 4;
  }
}
3061 
// Copy row of YUY2 Y's (422) into Y (420/422).
// Y samples are bytes 0 and 2 of every 4 byte group.
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  uint8_t* out = dst_y;
  int remaining = width;
  while (remaining >= 2) {
    out[0] = src_yuy2[0];
    out[1] = src_yuy2[2];
    src_yuy2 += 4;
    out += 2;
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: only the first Y of the final group is used.
    dst_y[width - 1] = src_yuy2[0];
  }
}
3075 
// Filter 2 rows of UYVY UV's (422) into U and V (420).
// For every 4 byte group, U (byte 0) and V (byte 2) are each averaged, with
// rounding, against the byte src_stride_uyvy further on. One U and one V are
// produced per 2 pixels of width.
void UYVYToUVRow_C(const uint8_t* src_uyvy,
                   int src_stride_uyvy,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width) {
  int remaining;
  for (remaining = width; remaining > 0; remaining -= 2) {
    const int u_sum = src_uyvy[0] + src_uyvy[src_stride_uyvy + 0];
    const int v_sum = src_uyvy[2] + src_uyvy[src_stride_uyvy + 2];
    *dst_u++ = (u_sum + 1) >> 1;
    *dst_v++ = (v_sum + 1) >> 1;
    src_uyvy += 4;
  }
}
3092 
// Copy row of UYVY UV's (422) into U and V (422).
// For every 4 byte group, byte 0 is written to dst_u and byte 2 to dst_v;
// one U and one V are produced per 2 pixels of width.
void UYVYToUV422Row_C(const uint8_t* src_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int remaining;
  for (remaining = width; remaining > 0; remaining -= 2) {
    *dst_u = src_uyvy[0];
    *dst_v = src_uyvy[2];
    ++dst_u;
    ++dst_v;
    src_uyvy += 4;
  }
}
3108 
// Copy row of UYVY Y's (422) into Y (420/422).
// Y samples are bytes 1 and 3 of every 4 byte group.
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  uint8_t* out = dst_y;
  int remaining = width;
  while (remaining >= 2) {
    out[0] = src_uyvy[1];
    out[1] = src_uyvy[3];
    src_uyvy += 4;
    out += 2;
    remaining -= 2;
  }
  if (width & 1) {
    // Odd width: only the first Y of the final group is used.
    dst_y[width - 1] = src_uyvy[1];
  }
}
3122 
// f + ((256 - a) * b >> 8), passed through clamp255.
#define BLEND(f, b, a) clamp255(((((256 - (a)) * (b)) >> 8) + (f)))

// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
// Bytes 0..2 of each output pixel are blended using byte 3 of the src_argb
// pixel as alpha; byte 3 of the output is always 255.
void ARGBBlendRow_C(const uint8_t* src_argb,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  int remaining = width;
  while (remaining >= 2) {
    int p;
    // Two pixels per iteration, each handled exactly like the tail below.
    for (p = 0; p < 2; ++p) {
      // Load every input before storing: dst_argb may alias a source row.
      const uint32_t fore_b = src_argb[0];
      const uint32_t fore_g = src_argb[1];
      const uint32_t fore_r = src_argb[2];
      const uint32_t alpha = src_argb[3];
      const uint32_t back_b = src_argb1[0];
      const uint32_t back_g = src_argb1[1];
      const uint32_t back_r = src_argb1[2];
      dst_argb[0] = BLEND(fore_b, back_b, alpha);
      dst_argb[1] = BLEND(fore_g, back_g, alpha);
      dst_argb[2] = BLEND(fore_r, back_r, alpha);
      dst_argb[3] = 255u;
      src_argb += 4;
      src_argb1 += 4;
      dst_argb += 4;
    }
    remaining -= 2;
  }

  if (width & 1) {
    // Odd width: one trailing pixel.
    const uint32_t fore_b = src_argb[0];
    const uint32_t fore_g = src_argb[1];
    const uint32_t fore_r = src_argb[2];
    const uint32_t alpha = src_argb[3];
    const uint32_t back_b = src_argb1[0];
    const uint32_t back_g = src_argb1[1];
    const uint32_t back_r = src_argb1[2];
    dst_argb[0] = BLEND(fore_b, back_b, alpha);
    dst_argb[1] = BLEND(fore_g, back_g, alpha);
    dst_argb[2] = BLEND(fore_r, back_r, alpha);
    dst_argb[3] = 255u;
  }
}
#undef BLEND
3177 
// Blend one sample: (a * f + (255 - a) * b + 255) >> 8.
// The arguments and the whole expansion are parenthesized so the macro is
// safe with any argument expression and in any surrounding expression.
#define UBLEND(f, b, a) ((((a) * (f)) + ((255 - (a)) * (b)) + 255) >> 8)

// Blend a row of single-channel samples: each dst sample is src0 weighted by
// alpha plus src1 weighted by (255 - alpha), with a per-sample alpha row.
//   src0, src1: the two input rows.
//   alpha:      per-sample weight for src0 (0..255).
//   dst:        output row.
//   width:      number of samples.
void BlendPlaneRow_C(const uint8_t* src0,
                     const uint8_t* src1,
                     const uint8_t* alpha,
                     uint8_t* dst,
                     int width) {
  int x;
  // Two samples per iteration.
  for (x = 0; x < width - 1; x += 2) {
    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
    src0 += 2;
    src1 += 2;
    alpha += 2;
    dst += 2;
  }
  // Trailing sample when width is odd.
  if (width & 1) {
    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
  }
}
#undef UBLEND
3198 
// ATTENUATE(f, a) scales channel value f by alpha a.
// Both variants are fully parenthesized (arguments and whole expansion) so
// they stay correct with any argument expression and surrounding operators.
#if LIBYUV_ATTENUATE_DUP
// This code mimics the SSSE3 version for better testability.
// Each 8-bit value is duplicated into 16 bits before the multiply.
#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
#else
#define ATTENUATE(f, a) (((f) * (a) + 128) >> 8)
#endif

// Multiply source RGB by alpha and store to destination.
// The alpha byte (byte 3 of each pixel) is copied unchanged.
//   src_argb: input row, 4 bytes per pixel (B, G, R, A).
//   dst_argb: output row, same layout.
//   width:    number of pixels.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int i;
  // Two pixels (8 bytes) per iteration.
  for (i = 0; i < width - 1; i += 2) {
    uint32_t b = src_argb[0];
    uint32_t g = src_argb[1];
    uint32_t r = src_argb[2];
    uint32_t a = src_argb[3];
    dst_argb[0] = ATTENUATE(b, a);
    dst_argb[1] = ATTENUATE(g, a);
    dst_argb[2] = ATTENUATE(r, a);
    dst_argb[3] = a;
    b = src_argb[4];
    g = src_argb[5];
    r = src_argb[6];
    a = src_argb[7];
    dst_argb[4] = ATTENUATE(b, a);
    dst_argb[5] = ATTENUATE(g, a);
    dst_argb[6] = ATTENUATE(r, a);
    dst_argb[7] = a;
    src_argb += 8;
    dst_argb += 8;
  }

  // Trailing pixel when width is odd.
  if (width & 1) {
    const uint32_t b = src_argb[0];
    const uint32_t g = src_argb[1];
    const uint32_t r = src_argb[2];
    const uint32_t a = src_argb[3];
    dst_argb[0] = ATTENUATE(b, a);
    dst_argb[1] = ATTENUATE(g, a);
    dst_argb[2] = ATTENUATE(r, a);
    dst_argb[3] = a;
  }
}
#undef ATTENUATE
3242 
// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values, i.e. 125.
// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
// Entry a is 0x01000000 + 0x10000 / a, except for three hand-written entries:
// index 0 (0x01000000), index 1 (0x0100ffff) and index 255 (0x01000100).
// T() is fully parenthesized so each initializer is a single grouped value.
#define T(a) (0x01000000 + (0x10000 / (a)))
const uint32_t fixed_invtbl8[256] = {
    0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
    T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
    T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
    T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
    T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
    T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
    T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
    T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
    T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
    T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
    T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
    T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
    T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
    T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
    T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
    T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
    T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
    T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
    T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
    T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
    T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
    T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
    T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
    T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
    T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
    T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
    T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
    T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
    T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
    T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
    T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
    T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100};
#undef T
3289 
3290 #if LIBYUV_UNATTENUATE_DUP
3291 // This code mimics the Intel SIMD version for better testability.
3292 #define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
3293 #else
3294 #define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
3295 #endif
3296 
3297 // mimics the Intel SIMD code for exactness.
ARGBUnattenuateRow_C(const uint8_t * src_argb,uint8_t * dst_argb,int width)3298 void ARGBUnattenuateRow_C(const uint8_t* src_argb,
3299                           uint8_t* dst_argb,
3300                           int width) {
3301   int i;
3302   for (i = 0; i < width; ++i) {
3303     uint32_t b = src_argb[0];
3304     uint32_t g = src_argb[1];
3305     uint32_t r = src_argb[2];
3306     const uint32_t a = src_argb[3];
3307     const uint32_t ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
3308 
3309     // Clamping should not be necessary but is free in assembly.
3310     dst_argb[0] = UNATTENUATE(b, ia);
3311     dst_argb[1] = UNATTENUATE(g, ia);
3312     dst_argb[2] = UNATTENUATE(r, ia);
3313     dst_argb[3] = a;
3314     src_argb += 4;
3315     dst_argb += 4;
3316   }
3317 }
3318 
// Build one row of a 4-channel cumulative sum: for every pixel and channel,
// cumsum = (sum of this row's channel values from the row start up to and
// including this pixel) + the previous_cumsum entry at the same position.
//   row:             input row, 4 bytes per pixel.
//   cumsum:          output, 4 int32 values per pixel.
//   previous_cumsum: earlier cumulative sum row, same layout as cumsum.
//   width:           number of pixels.
void ComputeCumulativeSumRow_C(const uint8_t* row,
                               int32_t* cumsum,
                               const int32_t* previous_cumsum,
                               int width) {
  int32_t running[4] = {0, 0, 0, 0};
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    // Accumulate all four channels first, then store, as the original does.
    for (c = 0; c < 4; ++c) {
      running[c] += row[c];
    }
    for (c = 0; c < 4; ++c) {
      cumsum[c] = running[c] + previous_cumsum[c];
    }
    row += 4;
    cumsum += 4;
    previous_cumsum += 4;
  }
}
3336 
// Turn cumulative sums into box averages. For each of count output pixels and
// each of its 4 channels, computes bl[w + c] + tl[c] - bl[c] - tl[w + c],
// multiplies by 1 / area and stores the result truncated to a byte.
//   tl, bl: two cumulative-sum rows (presumably the top and bottom edges of
//           the box — confirm against callers); advanced 4 values per pixel.
//   w:      offset, in int32 elements, between the two box edges in a row.
//   area:   divisor; must be non-zero (asserted).
//   dst:    output row, 4 bytes per pixel.
//   count:  number of output pixels.
void CumulativeSumToAverageRow_C(const int32_t* tl,
                                 const int32_t* bl,
                                 int w,
                                 int area,
                                 uint8_t* dst,
                                 int count) {
  float inv_area;
  int i;
  assert(area != 0);

  inv_area = 1.0f / area;
  for (i = 0; i < count; ++i) {
    int c;
    for (c = 0; c < 4; ++c) {
      const int32_t box_sum = bl[w + c] + tl[c] - bl[c] - tl[w + c];
      dst[c] = (uint8_t)(box_sum * inv_area);
    }
    dst += 4;
    tl += 4;
    bl += 4;
  }
}
3358 
3359 // Copy pixels from rotated source to destination row with a slope.
3360 LIBYUV_API
void ARGBAffineRow_C(const uint8_t* src_argb,
                     int src_argb_stride,
                     uint8_t* dst_argb,
                     const float* uv_dudv,
                     int width) {
  // Samples the source image along a straight line and writes the fetched
  // 4-byte pixels consecutively to dst_argb.
  //   src_argb:        source image base pointer.
  //   src_argb_stride: bytes between source rows.
  //   dst_argb:        output row, 4 bytes per pixel.
  //   uv_dudv:         {u, v, du, dv}: start coordinate and per-pixel step.
  //   width:           number of pixels to write.
  // Coordinates are truncated toward zero with an (int) cast; no bounds
  // checking is performed here.
  int i;
  float uv[2];
  uv[0] = uv_dudv[0];
  uv[1] = uv_dudv[1];
  for (i = 0; i < width; ++i) {
    const int x = (int)(uv[0]);
    const int y = (int)(uv[1]);
    // Offsets are computed in ptrdiff_t so large strides cannot overflow int.
    const uint8_t* src_pixel =
        src_argb + (ptrdiff_t)y * src_argb_stride + (ptrdiff_t)x * 4;
    // Copy through a byte buffer instead of casting to uint32_t*: the byte
    // pointers carry no alignment guarantee and the cast violated strict
    // aliasing. The temporary also keeps an exact src == dst overlap valid.
    uint8_t pixel[4];
    memcpy(pixel, src_pixel, sizeof(pixel));
    memcpy(dst_argb, pixel, sizeof(pixel));
    dst_argb += 4;
    uv[0] += uv_dudv[2];
    uv[1] += uv_dudv[3];
  }
}
3381 
// Average two rows into one: each output byte is the rounded-up mean of
// src_uv[x] and the byte src_uv_stride positions further on.
static void HalfRow_C(const uint8_t* src_uv,
                      ptrdiff_t src_uv_stride,
                      uint8_t* dst_uv,
                      int width) {
  int x = 0;
  while (x < width) {
    const int first = src_uv[x];
    const int second = src_uv[src_uv_stride + x];
    dst_uv[x] = (uint8_t)((first + second + 1) >> 1);
    ++x;
  }
}
3392 
// 16-bit variant of the two-row average: each output sample is the rounded-up
// mean of src_uv[x] and the sample src_uv_stride elements further on.
static void HalfRow_16_C(const uint16_t* src_uv,
                         ptrdiff_t src_uv_stride,
                         uint16_t* dst_uv,
                         int width) {
  int x = 0;
  while (x < width) {
    const int first = src_uv[x];
    const int second = src_uv[src_uv_stride + x];
    dst_uv[x] = (uint16_t)((first + second + 1) >> 1);
    ++x;
  }
}
3402 
// Average two 16-bit rows (rounded up) and narrow each result to 8 bits with
// the C16TO8 macro and the given scale.
static void HalfRow_16To8_C(const uint16_t* src_uv,
                            ptrdiff_t src_uv_stride,
                            uint8_t* dst_uv,
                            int scale,
                            int width) {
  int x = 0;
  while (x < width) {
    const int average = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
    dst_uv[x] = C16TO8(average, scale);
    ++x;
  }
}
3413 
// C version 2x2 -> 2x1.
// Blends the row at src_ptr with the row src_stride bytes later:
//   dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.
// f must be in [0, 256) (asserted). f == 0 copies row0 with memcpy and
// f == 128 uses HalfRow_C.
void InterpolateRow_C(uint8_t* dst_ptr,
                      const uint8_t* src_ptr,
                      ptrdiff_t src_stride,
                      int width,
                      int source_y_fraction) {
  const int weight1 = source_y_fraction;
  const int weight0 = 256 - weight1;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int x;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);

  switch (weight1) {
    case 0:
      memcpy(dst_ptr, src_ptr, width);
      return;
    case 128:
      HalfRow_C(src_ptr, src_stride, dst_ptr, width);
      return;
    default:
      break;
  }
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (src_ptr[x] * weight0 + src_ptr1[x] * weight1 + 128) >> 8;
  }
}
3443 
// C version 2x2 -> 2x1.
// 16-bit variant: blends the row at src_ptr with the row src_stride elements
// later:
//   dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.
// f must be in [0, 256) (asserted). f == 0 copies row0 with memcpy
// (width * 2 bytes) and f == 128 uses HalfRow_16_C.
void InterpolateRow_16_C(uint16_t* dst_ptr,
                         const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  const int weight1 = source_y_fraction;
  const int weight0 = 256 - weight1;
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  int x;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);

  switch (weight1) {
    case 0:
      memcpy(dst_ptr, src_ptr, width * 2);
      return;
    case 128:
      HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
      return;
    default:
      break;
  }
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (src_ptr[x] * weight0 + src_ptr1[x] * weight1 + 128) >> 8;
  }
}
3473 
// C version 2x2 16 bit-> 2x1 8 bit.
// Blends two 16-bit rows by source_y_fraction (in [0, 256), asserted) and
// narrows each result to 8 bits with C16TO8.
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits

void InterpolateRow_16To8_C(uint8_t* dst_ptr,
                            const uint16_t* src_ptr,
                            ptrdiff_t src_stride,
                            int scale,
                            int width,
                            int source_y_fraction) {
  const int weight1 = source_y_fraction;
  const int weight0 = 256 - weight1;
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  int x;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);

  switch (source_y_fraction) {
    case 0:
      // No blending needed: just narrow the first row.
      Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
      return;
    case 128:
      HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
      return;
    default:
      break;
  }
  for (x = 0; x < width; ++x) {
    const int blended =
        (src_ptr[x] * weight0 + src_ptr1[x] * weight1 + 128) >> 8;
    dst_ptr[x] = C16TO8(blended, scale);
  }
}
3512 
// Reorder the bytes of every 4-byte pixel: output byte k of a pixel is taken
// from input byte shuffler[k] of the same pixel. Only the first 4 shuffler
// values are used, and they are read once before the loop.
void ARGBShuffleRow_C(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      const uint8_t* shuffler,
                      int width) {
  const int from0 = shuffler[0];
  const int from1 = shuffler[1];
  const int from2 = shuffler[2];
  const int from3 = shuffler[3];
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Gather all four bytes before storing to support in-place conversion.
    uint8_t picked[4];
    picked[0] = src_argb[from0];
    picked[1] = src_argb[from1];
    picked[2] = src_argb[from2];
    picked[3] = src_argb[from3];
    dst_argb[0] = picked[0];
    dst_argb[1] = picked[1];
    dst_argb[2] = picked[2];
    dst_argb[3] = picked[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
3538 
// Pack planar Y, U and V rows into 4-byte groups ordered Y0, U, Y1, V.
// Each group takes two Y samples and one U/V pair. For an odd width the
// final group is still written in full, with its second Y byte set to 0.
void I422ToYUY2Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width) {
  int x = 0;
  while (x < width - 1) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = *src_u++;
    dst_frame[2] = src_y[1];
    dst_frame[3] = *src_v++;
    dst_frame += 4;
    src_y += 2;
    x += 2;
  }
  if ((width & 1) != 0) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = 0;
    dst_frame[3] = src_v[0];
  }
}
3562 
// Pack planar Y, U and V rows into 4-byte groups ordered U, Y0, V, Y1.
// Each group takes two Y samples and one U/V pair. For an odd width the
// final group is still written in full, with its second Y byte set to 0.
void I422ToUYVYRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width) {
  int x = 0;
  while (x < width - 1) {
    dst_frame[0] = *src_u++;
    dst_frame[1] = src_y[0];
    dst_frame[2] = *src_v++;
    dst_frame[3] = src_y[1];
    dst_frame += 4;
    src_y += 2;
    x += 2;
  }
  if ((width & 1) != 0) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = 0;
  }
}
3586 
// Apply a cubic polynomial to every channel of every pixel.
// For channel c (0..3) with input value v the result is
//   poly[c] + poly[c + 4] * v + poly[c + 8] * v^2 + poly[c + 12] * v^3,
// converted to int32_t and passed through Clamp before being stored.
void ARGBPolynomialRow_C(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const float* poly,
                         int width) {
  int i;
  for (i = 0; i < width; ++i) {
    float result[4];
    int c;
    // Evaluate all four channels before storing any of them.
    for (c = 0; c < 4; ++c) {
      const float v = (float)(src_argb[c]);
      const float v2 = v * v;
      const float v3 = v2 * v;
      float acc = poly[c] + poly[c + 4] * v;
      acc += poly[c + 8] * v2;
      acc += poly[c + 12] * v3;
      result[c] = acc;
    }
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = Clamp((int32_t)(result[c]));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
3626 
// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
// adjust the source integer range to the half float range desired.

// This magic constant is 2^-112. Multiplying by this
// is the same as subtracting 112 from the exponent, which
// is the difference in exponent bias between 32-bit and
// 16-bit floats. Once we've done this subtraction, we can
// simply extract the low bits of the exponent and the high
// bits of the mantissa from our float and we're done.

// Work around GCC 7 punning warning -Wstrict-aliasing
// (HalfFloatRow_C below no longer needs this type; it is retained in case
// other code refers to it.)
#if defined(__GNUC__)
typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
#else
typedef uint32_t uint32_alias_t;
#endif

// Convert a row of 16-bit integer samples to 16-bit half floats.
//   src:   input samples.
//   dst:   output half-float bit patterns.
//   scale: multiplier applied to each sample before conversion.
//   width: number of samples.
void HalfFloatRow_C(const uint16_t* src,
                    uint16_t* dst,
                    float scale,
                    int width) {
  int i;
  float mult = 1.9259299444e-34f * scale;
  for (i = 0; i < width; ++i) {
    float value = src[i] * mult;
    // Read the float's bit pattern with memcpy rather than a pointer cast:
    // this is well defined on every compiler, not only those that support
    // the may_alias attribute.
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));
    dst[i] = (uint16_t)(bits >> 13);
  }
}
3655 
// Convert a row of bytes to floats, multiplying each sample by scale.
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const float scaled = *src * scale;
    *dst = scaled;
    ++src;
    ++dst;
  }
}
3663 
// Remap the B, G and R bytes of each pixel through a colour table selected
// by a weighted sum of that pixel's channels; alpha is copied unchanged.
//   lumacoeff: packed weights, byte 0 for B, byte 1 for G, byte 2 for R.
//   luma:      table base. The weighted sum masked with 0x7F00 is the byte
//              offset of the 256-entry table used for the pixel.
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width,
                             const uint8_t* luma,
                             uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;

  int i;
  // Two pixels (8 bytes) per iteration.
  for (i = 0; i < width - 1; i += 2) {
    int p;
    for (p = 0; p < 8; p += 4) {
      // Luminance in rows, color values in columns.
      const uint8_t* table =
          ((src_argb[p + 0] * bc + src_argb[p + 1] * gc +
            src_argb[p + 2] * rc) &
           0x7F00u) +
          luma;
      dst_argb[p + 0] = table[src_argb[p + 0]];
      dst_argb[p + 1] = table[src_argb[p + 1]];
      dst_argb[p + 2] = table[src_argb[p + 2]];
      dst_argb[p + 3] = src_argb[p + 3];
    }
    src_argb += 8;
    dst_argb += 8;
  }
  // Trailing pixel when width is odd.
  if ((width & 1) != 0) {
    const uint8_t* table =
        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
        luma;
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];
  }
}
3705 
// Copy byte 3 (alpha) of every 4-byte pixel from src to dst, leaving the
// other three bytes of each dst pixel untouched.
void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int i = 0;
  // Two pixels per iteration.
  while (i < width - 1) {
    dst[3] = src[3];
    dst[7] = src[7];
    dst += 8;
    src += 8;
    i += 2;
  }
  // Trailing pixel when width is odd.
  if ((width & 1) != 0) {
    dst[3] = src[3];
  }
}
3718 
// Gather byte 3 (alpha) of every 4-byte source pixel into a packed row with
// one byte per pixel.
void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
  int i = 0;
  // Two pixels per iteration.
  while (i < width - 1) {
    dst_a[0] = src_argb[3];
    dst_a[1] = src_argb[7];
    dst_a += 2;
    src_argb += 8;
    i += 2;
  }
  // Trailing pixel when width is odd.
  if ((width & 1) != 0) {
    dst_a[0] = src_argb[3];
  }
}
3731 
// Scatter a packed one-byte-per-pixel row (src) into byte 3 of every 4-byte
// pixel of dst, leaving the other three bytes of each dst pixel untouched.
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int i = 0;
  // Two pixels per iteration.
  while (i < width - 1) {
    dst[3] = src[0];
    dst[7] = src[1];
    dst += 8;
    src += 2;
    i += 2;
  }
  // Trailing pixel when width is odd.
  if ((width & 1) != 0) {
    dst[3] = src[0];
  }
}
3744 
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048

#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \
    defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
// Step 1 converts I422 to ARGB into a local row buffer, step 2 converts that
// buffer to RGB565. The row is handled in chunks of at most MAXTWIDTH pixels.
void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Row buffer for intermediate ARGB pixels (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
    src_y += twidth;
    src_u += twidth / 2;  // One U/V sample per two pixels.
    src_v += twidth / 2;
    dst_rgb565 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3770 
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
// Two step wrapper: I422 -> ARGB into a local row buffer, then
// ARGB -> ARGB1555, in chunks of at most MAXTWIDTH pixels.
void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
    src_y += twidth;
    src_u += twidth / 2;  // One U/V sample per two pixels.
    src_v += twidth / 2;
    dst_argb1555 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3792 
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
// Two step wrapper: I422 -> ARGB into a local row buffer, then
// ARGB -> ARGB4444, in chunks of at most MAXTWIDTH pixels.
void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
    src_y += twidth;
    src_u += twidth / 2;  // One U/V sample per two pixels.
    src_v += twidth / 2;
    dst_argb4444 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3814 
#if defined(HAS_NV12TORGB565ROW_SSSE3)
// Two step wrapper: NV12 -> ARGB into a local row buffer, then
// ARGB -> RGB565, in chunks of at most MAXTWIDTH pixels.
void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
                           const uint8_t* src_uv,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
    src_y += twidth;
    src_uv += twidth;
    dst_rgb565 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3834 
#if defined(HAS_NV12TORGB24ROW_SSSE3)
// Two step wrapper: NV12 -> ARGB into a local row buffer, then
// ARGB -> RGB24, in chunks of at most MAXTWIDTH pixels.
void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
    src_y += twidth;
    src_uv += twidth;
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3854 
#if defined(HAS_NV21TORGB24ROW_SSSE3)
// Two step wrapper: NV21 -> ARGB into a local row buffer, then
// ARGB -> RGB24, in chunks of at most MAXTWIDTH pixels.
void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth);
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
    src_y += twidth;
    src_vu += twidth;
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3874 
#if defined(HAS_NV12TORGB24ROW_AVX2)
// Two step wrapper: NV12 -> ARGB into a local row buffer, then
// ARGB -> RGB24, in chunks of at most MAXTWIDTH pixels. The second step uses
// the AVX2 row function when it is available and the SSSE3 one otherwise.
void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
    src_y += twidth;
    src_uv += twidth;
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3898 
#if defined(HAS_NV21TORGB24ROW_AVX2)
// Two step wrapper: NV21 -> ARGB into a local row buffer, then
// ARGB -> RGB24, in chunks of at most MAXTWIDTH pixels. The second step uses
// the AVX2 row function when it is available and the SSSE3 one otherwise.
void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int remaining;
  int twidth;
  for (remaining = width; remaining > 0; remaining -= twidth) {
    twidth = remaining;
    if (twidth > MAXTWIDTH) {
      twidth = MAXTWIDTH;
    }
    NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
    src_y += twidth;
    src_vu += twidth;
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3922 
3923 #if defined(HAS_I422TORGB565ROW_AVX2)
I422ToRGB565Row_AVX2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)3924 void I422ToRGB565Row_AVX2(const uint8_t* src_y,
3925                           const uint8_t* src_u,
3926                           const uint8_t* src_v,
3927                           uint8_t* dst_rgb565,
3928                           const struct YuvConstants* yuvconstants,
3929                           int width) {
3930   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
3931   while (width > 0) {
3932     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
3933     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
3934 #if defined(HAS_ARGBTORGB565ROW_AVX2)
3935     ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
3936 #else
3937     ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
3938 #endif
3939     src_y += twidth;
3940     src_u += twidth / 2;
3941     src_v += twidth / 2;
3942     dst_rgb565 += twidth * 2;
3943     width -= twidth;
3944   }
3945 }
3946 #endif
3947 
3948 #if defined(HAS_I422TOARGB1555ROW_AVX2)
I422ToARGB1555Row_AVX2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb1555,const struct YuvConstants * yuvconstants,int width)3949 void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
3950                             const uint8_t* src_u,
3951                             const uint8_t* src_v,
3952                             uint8_t* dst_argb1555,
3953                             const struct YuvConstants* yuvconstants,
3954                             int width) {
3955   // Row buffer for intermediate ARGB pixels.
3956   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
3957   while (width > 0) {
3958     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
3959     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
3960 #if defined(HAS_ARGBTOARGB1555ROW_AVX2)
3961     ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
3962 #else
3963     ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
3964 #endif
3965     src_y += twidth;
3966     src_u += twidth / 2;
3967     src_v += twidth / 2;
3968     dst_argb1555 += twidth * 2;
3969     width -= twidth;
3970   }
3971 }
3972 #endif
3973 
3974 #if defined(HAS_I422TOARGB4444ROW_AVX2)
I422ToARGB4444Row_AVX2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb4444,const struct YuvConstants * yuvconstants,int width)3975 void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
3976                             const uint8_t* src_u,
3977                             const uint8_t* src_v,
3978                             uint8_t* dst_argb4444,
3979                             const struct YuvConstants* yuvconstants,
3980                             int width) {
3981   // Row buffer for intermediate ARGB pixels.
3982   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
3983   while (width > 0) {
3984     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
3985     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
3986 #if defined(HAS_ARGBTOARGB4444ROW_AVX2)
3987     ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
3988 #else
3989     ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
3990 #endif
3991     src_y += twidth;
3992     src_u += twidth / 2;
3993     src_v += twidth / 2;
3994     dst_argb4444 += twidth * 2;
3995     width -= twidth;
3996   }
3997 }
3998 #endif
3999 
4000 #if defined(HAS_I422TORGB24ROW_AVX2)
I422ToRGB24Row_AVX2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)4001 void I422ToRGB24Row_AVX2(const uint8_t* src_y,
4002                          const uint8_t* src_u,
4003                          const uint8_t* src_v,
4004                          uint8_t* dst_rgb24,
4005                          const struct YuvConstants* yuvconstants,
4006                          int width) {
4007   // Row buffer for intermediate ARGB pixels.
4008   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
4009   while (width > 0) {
4010     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
4011     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
4012 #if defined(HAS_ARGBTORGB24ROW_AVX2)
4013     ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
4014 #else
4015     ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
4016 #endif
4017     src_y += twidth;
4018     src_u += twidth / 2;
4019     src_v += twidth / 2;
4020     dst_rgb24 += twidth * 3;
4021     width -= twidth;
4022   }
4023 }
4024 #endif
4025 
4026 #if defined(HAS_NV12TORGB565ROW_AVX2)
NV12ToRGB565Row_AVX2(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)4027 void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
4028                           const uint8_t* src_uv,
4029                           uint8_t* dst_rgb565,
4030                           const struct YuvConstants* yuvconstants,
4031                           int width) {
4032   // Row buffer for intermediate ARGB pixels.
4033   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
4034   while (width > 0) {
4035     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
4036     NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
4037 #if defined(HAS_ARGBTORGB565ROW_AVX2)
4038     ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
4039 #else
4040     ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
4041 #endif
4042     src_y += twidth;
4043     src_uv += twidth;
4044     dst_rgb565 += twidth * 2;
4045     width -= twidth;
4046   }
4047 }
4048 #endif
4049 
4050 #ifdef HAS_RGB24TOYJROW_AVX2
4051 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
RGB24ToYJRow_AVX2(const uint8_t * src_rgb24,uint8_t * dst_yj,int width)4052 void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
4053   // Row buffer for intermediate ARGB pixels.
4054   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
4055   while (width > 0) {
4056     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
4057     RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
4058     ARGBToYJRow_AVX2(row, dst_yj, twidth);
4059     src_rgb24 += twidth * 3;
4060     dst_yj += twidth;
4061     width -= twidth;
4062   }
4063 }
4064 #endif  // HAS_RGB24TOYJROW_AVX2
4065 
4066 #ifdef HAS_RAWTOYJROW_AVX2
4067 // Convert 16 RAW pixels (64 bytes) to 16 YJ values.
RAWToYJRow_AVX2(const uint8_t * src_raw,uint8_t * dst_yj,int width)4068 void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
4069   // Row buffer for intermediate ARGB pixels.
4070   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
4071   while (width > 0) {
4072     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
4073     RAWToARGBRow_SSSE3(src_raw, row, twidth);
4074     ARGBToYJRow_AVX2(row, dst_yj, twidth);
4075     src_raw += twidth * 3;
4076     dst_yj += twidth;
4077     width -= twidth;
4078   }
4079 }
4080 #endif  // HAS_RAWTOYJROW_AVX2
4081 
4082 #ifdef HAS_RGB24TOYJROW_SSSE3
4083 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
RGB24ToYJRow_SSSE3(const uint8_t * src_rgb24,uint8_t * dst_yj,int width)4084 void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
4085   // Row buffer for intermediate ARGB pixels.
4086   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
4087   while (width > 0) {
4088     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
4089     RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
4090     ARGBToYJRow_SSSE3(row, dst_yj, twidth);
4091     src_rgb24 += twidth * 3;
4092     dst_yj += twidth;
4093     width -= twidth;
4094   }
4095 }
4096 #endif  // HAS_RGB24TOYJROW_SSSE3
4097 
4098 #ifdef HAS_RAWTOYJROW_SSSE3
4099 // Convert 16 RAW pixels (64 bytes) to 16 YJ values.
RAWToYJRow_SSSE3(const uint8_t * src_raw,uint8_t * dst_yj,int width)4100 void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
4101   // Row buffer for intermediate ARGB pixels.
4102   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
4103   while (width > 0) {
4104     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
4105     RAWToARGBRow_SSSE3(src_raw, row, twidth);
4106     ARGBToYJRow_SSSE3(row, dst_yj, twidth);
4107     src_raw += twidth * 3;
4108     dst_yj += twidth;
4109     width -= twidth;
4110   }
4111 }
4112 #endif  // HAS_RAWTOYJROW_SSSE3
4113 
4114 #ifdef HAS_INTERPOLATEROW_16TO8_AVX2
InterpolateRow_16To8_AVX2(uint8_t * dst_ptr,const uint16_t * src_ptr,ptrdiff_t src_stride,int scale,int width,int source_y_fraction)4115 void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
4116                                const uint16_t* src_ptr,
4117                                ptrdiff_t src_stride,
4118                                int scale,
4119                                int width,
4120                                int source_y_fraction) {
4121   // Row buffer for intermediate 16 bit pixels.
4122   SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
4123   while (width > 0) {
4124     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
4125     InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
4126     Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
4127     src_ptr += twidth;
4128     dst_ptr += twidth;
4129     width -= twidth;
4130   }
4131 }
4132 #endif  // HAS_INTERPOLATEROW_16TO8_AVX2
4133 
// Writes src[i] * scale to dst[i] for `width` samples and returns the sum of
// squares of the unscaled source samples (0 when width <= 0).
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
  float sum_sq = 0.f;
  int n;
  for (n = 0; n < width; ++n) {
    const float sample = src[n];
    sum_sq += sample * sample;
    dst[n] = sample * scale;
  }
  return sum_sq;
}
4144 
// Writes src[i] * scale to dst[i] for `width` samples and returns the largest
// unscaled source sample. The running maximum starts at 0, so the result is
// never negative.
float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
  float peak = 0.f;
  int n;
  for (n = 0; n < width; ++n) {
    const float sample = src[n];
    dst[n] = sample * scale;
    if (sample > peak) {
      peak = sample;
    }
  }
  return peak;
}
4156 
// Writes src[i] * scale to dst[i] for `width` samples.
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
  int n;
  for (n = 0; n < width; ++n) {
    dst[n] = src[n] * scale;
  }
}
4163 
// Horizontal 5-tap filter with weights 1, 4, 6, 4, 1.
// Output i is built from src[i] .. src[i + 4], so width + 4 source values are
// read. The weighted sum is rounded (+128) and shifted right by 8 before
// being stored as 16 bits.
void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t* tap = src + x;
    const uint32_t acc =
        tap[0] + tap[1] * 4 + tap[2] * 6 + tap[3] * 4 + tap[4] + 128;
    dst[x] = (uint16_t)(acc >> 8);
  }
}
4172 
// Vertical 5-tap filter: combines five 16-bit rows with weights 1, 4, 6, 4, 1
// into one row of 32-bit sums. No rounding or shifting is applied here.
void GaussCol_C(const uint16_t* src0,
                const uint16_t* src1,
                const uint16_t* src2,
                const uint16_t* src3,
                const uint16_t* src4,
                uint32_t* dst,
                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src0[x] + src1[x] * 4 + src2[x] * 6 + src3[x] * 4 + src4[x];
  }
}
4186 
// Horizontal 5-tap float filter with weights 1, 4, 6, 4, 1.
// Output i is built from src[i] .. src[i + 4], so width + 4 source values are
// read. The weighted sum is multiplied by 1/256.
void GaussRow_F32_C(const float* src, float* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const float* tap = src + x;
    dst[x] = (tap[0] + tap[1] * 4 + tap[2] * 6 + tap[3] * 4 + tap[4]) *
             (1.0f / 256.0f);
  }
}
4195 
// Vertical 5-tap float filter: combines five rows with weights 1, 4, 6, 4, 1
// into one row. No normalization is applied here.
void GaussCol_F32_C(const float* src0,
                    const float* src1,
                    const float* src2,
                    const float* src3,
                    const float* src4,
                    float* dst,
                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src0[x] + src1[x] * 4 + src2[x] * 6 + src3[x] * 4 + src4[x];
  }
}
4209 
// Packs biplanar NV21 (Y row plus interleaved VU row) into YUV24.
// Each output pixel is 3 bytes in V, U, Y order. Two horizontally adjacent
// pixels share one VU pair; an odd trailing pixel uses the next VU pair.
void NV21ToYUV24Row_C(const uint8_t* src_y,
                      const uint8_t* src_vu,
                      uint8_t* dst_yuv24,
                      int width) {
  int remaining = width;
  while (remaining > 1) {
    const uint8_t v = src_vu[0];
    const uint8_t u = src_vu[1];
    // First pixel of the pair.
    dst_yuv24[0] = v;
    dst_yuv24[1] = u;
    dst_yuv24[2] = src_y[0];
    // Second pixel of the pair reuses the same chroma.
    dst_yuv24[3] = v;
    dst_yuv24[4] = u;
    dst_yuv24[5] = src_y[1];
    src_y += 2;
    src_vu += 2;
    dst_yuv24 += 6;
    remaining -= 2;
  }
  if (width & 1) {
    dst_yuv24[0] = src_vu[0];  // V
    dst_yuv24[1] = src_vu[1];  // U
    dst_yuv24[2] = src_y[0];   // Y
  }
}
4233 
// Downsamples the chroma of two AYUV rows (444) into one interleaved UV row
// (420). AYUV is stored as V, U, Y, A bytes; the output is U then V.
// Each output pair is the rounded average of a 2x2 pixel block; an odd
// trailing column averages only the two vertically adjacent pixels.
void AYUVToUVRow_C(const uint8_t* src_ayuv,
                   int src_stride_ayuv,
                   uint8_t* dst_uv,
                   int width) {
  const uint8_t* top = src_ayuv;
  const uint8_t* bottom = src_ayuv + src_stride_ayuv;
  int remaining = width;
  while (remaining > 1) {
    // U lives at byte 1 of each 4-byte pixel, V at byte 0.
    dst_uv[0] = (uint8_t)((top[1] + top[5] + bottom[1] + bottom[5] + 2) >> 2);
    dst_uv[1] = (uint8_t)((top[0] + top[4] + bottom[0] + bottom[4] + 2) >> 2);
    top += 8;
    bottom += 8;
    dst_uv += 2;
    remaining -= 2;
  }
  if (width & 1) {
    dst_uv[0] = (uint8_t)((top[1] + bottom[1] + 1) >> 1);
    dst_uv[1] = (uint8_t)((top[0] + bottom[0] + 1) >> 1);
  }
}
4257 
// Downsamples the chroma of two AYUV rows (444) into one interleaved VU row
// (420). Source pixels are 4 bytes; byte 0 feeds the first output byte (V)
// and byte 1 feeds the second (U).
// Each output pair is the rounded average of a 2x2 pixel block; an odd
// trailing column averages only the two vertically adjacent pixels.
void AYUVToVURow_C(const uint8_t* src_ayuv,
                   int src_stride_ayuv,
                   uint8_t* dst_vu,
                   int width) {
  const uint8_t* top = src_ayuv;
  const uint8_t* bottom = src_ayuv + src_stride_ayuv;
  int remaining = width;
  while (remaining > 1) {
    dst_vu[0] = (uint8_t)((top[0] + top[4] + bottom[0] + bottom[4] + 2) >> 2);
    dst_vu[1] = (uint8_t)((top[1] + top[5] + bottom[1] + bottom[5] + 2) >> 2);
    top += 8;
    bottom += 8;
    dst_vu += 2;
    remaining -= 2;
  }
  if (width & 1) {
    dst_vu[0] = (uint8_t)((top[0] + bottom[0] + 1) >> 1);
    dst_vu[1] = (uint8_t)((top[1] + bottom[1] + 1) >> 1);
  }
}
4280 
// Extracts the Y byte of each AYUV pixel into a planar Y row.
// Source pixels are 4 bytes in V, U, Y, A order, so Y is byte 2 of each.
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_ayuv[x * 4 + 2];
  }
}
4290 
// Swaps the two bytes of each interleaved chroma pair, turning an NV12 UV row
// into an NV21 VU row. `width` counts pairs, not bytes. Both source bytes of
// a pair are read before either destination byte is written.
void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  int pair;
  for (pair = 0; pair < width; ++pair) {
    const uint8_t first = src_uv[pair * 2];
    const uint8_t second = src_uv[pair * 2 + 1];
    dst_vu[pair * 2] = second;
    dst_vu[pair * 2 + 1] = first;
  }
}
4303 
// Halves separate U and V planes in both directions and interleaves the
// result as UV pairs. Each output byte is the rounded average of a 2x2 block
// taken from the row at src_u / src_v and the row one stride further on.
// An odd trailing column averages only the two vertically adjacent samples.
void HalfMergeUVRow_C(const uint8_t* src_u,
                      int src_stride_u,
                      const uint8_t* src_v,
                      int src_stride_v,
                      uint8_t* dst_uv,
                      int width) {
  const uint8_t* u_top = src_u;
  const uint8_t* u_bottom = src_u + src_stride_u;
  const uint8_t* v_top = src_v;
  const uint8_t* v_bottom = src_v + src_stride_v;
  int remaining = width;
  while (remaining > 1) {
    dst_uv[0] =
        (uint8_t)((u_top[0] + u_top[1] + u_bottom[0] + u_bottom[1] + 2) >> 2);
    dst_uv[1] =
        (uint8_t)((v_top[0] + v_top[1] + v_bottom[0] + v_bottom[1] + 2) >> 2);
    u_top += 2;
    u_bottom += 2;
    v_top += 2;
    v_bottom += 2;
    dst_uv += 2;
    remaining -= 2;
  }
  if (width & 1) {
    dst_uv[0] = (uint8_t)((u_top[0] + u_bottom[0] + 1) >> 1);
    dst_uv[1] = (uint8_t)((v_top[0] + v_bottom[0] + 1) >> 1);
  }
}
4327 
4328 #ifdef __cplusplus
4329 }  // extern "C"
4330 }  // namespace libyuv
4331 #endif
4332