• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #include <assert.h>
14 #include <string.h>  // For memcpy and memset.
15 
16 #include "libyuv/basic_types.h"
17 #include "libyuv/convert_argb.h"  // For kYuvI601Constants
18 
19 #ifdef __cplusplus
20 namespace libyuv {
21 extern "C" {
22 #endif
23 
24 // This macro controls YUV to RGB using unsigned math to extend range of
25 // YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
26 // LIBYUV_UNLIMITED_DATA
27 
28 // Macros to enable unlimited data for each colorspace
29 // LIBYUV_UNLIMITED_BT601
30 // LIBYUV_UNLIMITED_BT709
31 // LIBYUV_UNLIMITED_BT2020
32 
33 // The following macro from row_win makes the C code match the row_win code,
34 // which is 7 bit fixed point for ARGBToI420:
35 #if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
36     defined(_MSC_VER) && !defined(__clang__) &&                   \
37     (defined(_M_IX86) || defined(_M_X64))
38 #define LIBYUV_RGB7 1
39 #endif
40 
41 #if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
42                                    defined(__i386__) || defined(_M_IX86))
43 #define LIBYUV_ARGBTOUV_PAVGB 1
44 #define LIBYUV_RGBTOU_TRUNCATE 1
45 #define LIBYUV_ATTENUATE_DUP 1
46 #endif
47 #if defined(LIBYUV_BIT_EXACT)
48 #define LIBYUV_UNATTENUATE_DUP 1
49 #endif
50 
51 // llvm x86 is poor at ternary operator, so use branchless min/max.
52 
53 #define USE_BRANCHLESS 1
54 #if USE_BRANCHLESS
// Branchless lower clamp: returns v when v >= 0, otherwise 0.
static __inline int32_t clamp0(int32_t v) {
  // All ones when v is non-negative, zero otherwise.
  const int32_t keep = -(int32_t)(v >= 0);
  return v & keep;
}
58 // TODO(fbarchard): make clamp255 preserve negative values.
// Branchless upper clamp to 255.
// Values >= 255 return 255; values in [0, 254] return unchanged.  Negative
// inputs are NOT preserved: they come back as (v & 255), so callers are
// expected to remove negatives first.
static __inline int32_t clamp255(int32_t v) {
  // All ones when v has reached the limit, zero otherwise.
  const int32_t saturate = -(int32_t)(v >= 255);
  return (v | saturate) & 255;
}
62 
// Branchless upper clamp to 1023 (10-bit maximum).
// Values >= 1023 return 1023; values in [0, 1022] return unchanged.  Negative
// inputs are masked to (v & 1023) rather than preserved.
static __inline int32_t clamp1023(int32_t v) {
  // All ones when v has reached the limit, zero otherwise.
  const int32_t saturate = -(int32_t)(v >= 1023);
  return (v | saturate) & 1023;
}
66 
67 // clamp to max
// Branchless upper clamp: returns max when v >= max, otherwise v.
// The previous mask-based form ((-(v >= max) | v) & max) was only a clamp
// when max had the form 2^k - 1 and v was non-negative; this select form
// gives the same result for those inputs and is also correct for any other
// max, matching the ternary fallback implementation.
static __inline int32_t ClampMax(int32_t v, int32_t max) {
  // All ones when v has reached the limit, zero otherwise.
  const int32_t over = -(int32_t)(v >= max);
  return (v & ~over) | (max & over);
}
71 
// Branchless absolute value, returned as unsigned.
// The arithmetic is done in uint32_t so that INT32_MIN does not trigger
// signed overflow; Abs(INT32_MIN) yields 2147483648.
static __inline uint32_t Abs(int32_t v) {
  // 0xFFFFFFFF when v is negative, 0 otherwise.
  const uint32_t sign = 0u - (uint32_t)(v < 0);
  // Two's complement negate when sign is all ones: (v - 1) ^ ~0.
  return ((uint32_t)v + sign) ^ sign;
}
76 #else   // USE_BRANCHLESS
// Lower clamp: returns v when v >= 0, otherwise 0.
static __inline int32_t clamp0(int32_t v) {
  if (v < 0) {
    return 0;
  }
  return v;
}
80 
// Upper clamp to 255.  Values below the limit, including negatives, are
// returned unchanged.
static __inline int32_t clamp255(int32_t v) {
  if (v > 255) {
    return 255;
  }
  return v;
}
84 
// Upper clamp to 1023 (10-bit maximum).  Values below the limit, including
// negatives, are returned unchanged.
static __inline int32_t clamp1023(int32_t v) {
  if (v > 1023) {
    return 1023;
  }
  return v;
}
88 
// Upper clamp to an arbitrary limit: returns max when v > max, otherwise v.
static __inline int32_t ClampMax(int32_t v, int32_t max) {
  if (v > max) {
    return max;
  }
  return v;
}
92 
// Absolute value, returned as unsigned.
// Negation is performed in uint32_t so that INT32_MIN does not overflow a
// signed int; Abs(INT32_MIN) yields 2147483648.
static __inline uint32_t Abs(int32_t v) {
  return (v < 0) ? (0u - (uint32_t)v) : (uint32_t)v;
}
96 #endif  // USE_BRANCHLESS
// Saturates val to an 8-bit sample: negatives are raised to 0 by clamp0
// first, then the result is capped at 255 by clamp255.
static __inline uint32_t Clamp(int32_t val) {
  return (uint32_t)clamp255(clamp0(val));
}
101 
// Saturates val to a 10-bit sample: negatives are raised to 0 by clamp0
// first, then the result is capped at 1023 by clamp1023.
static __inline uint32_t Clamp10(int32_t val) {
  return (uint32_t)clamp1023(clamp0(val));
}
106 
107 // Little Endian
108 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
109     defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) ||     \
110     (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
// Stores the 32-bit value v at p with one native word store.  This variant is
// the one selected by the surrounding little-endian platform check.  The
// argument and the whole expansion are parenthesized so the macro behaves as
// a single expression wherever it is used.
#define WRITEWORD(p, v) (*(uint32_t*)(p) = (v))
112 #else
// Stores the 32-bit value v at p one byte at a time, least significant byte
// first, so the memory layout is little-endian regardless of the host.
static inline void WRITEWORD(uint8_t* p, uint32_t v) {
  int i;
  for (i = 0; i < 4; ++i) {
    p[i] = (uint8_t)(v >> (8 * i));
  }
}
119 #endif
120 
// Expands a row of 3-byte RGB24 pixels (bytes B, G, R) into 4-byte ARGB
// pixels (bytes B, G, R, A) with alpha set to 255.
//   src_rgb24: source row, 3 bytes per pixel.
//   dst_argb:  destination row, 4 bytes per pixel.
//   width:     number of pixels to convert.
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t blue = src_rgb24[0];
    const uint8_t green = src_rgb24[1];
    const uint8_t red = src_rgb24[2];
    dst_argb[0] = blue;
    dst_argb[1] = green;
    dst_argb[2] = red;
    dst_argb[3] = 255u;
    src_rgb24 += 3;
    dst_argb += 4;
  }
}
135 
// Expands a row of 3-byte RAW pixels (bytes R, G, B) into 4-byte ARGB pixels
// (bytes B, G, R, A) with alpha set to 255.
//   src_raw:  source row, 3 bytes per pixel.
//   dst_argb: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    dst_argb[0] = blue;
    dst_argb[1] = green;
    dst_argb[2] = red;
    dst_argb[3] = 255u;
    src_raw += 3;
    dst_argb += 4;
  }
}
150 
// Expands a row of 3-byte RAW pixels (bytes R, G, B) into 4-byte RGBA pixels
// (bytes A, B, G, R) with alpha set to 255.
//   src_raw:  source row, 3 bytes per pixel.
//   dst_rgba: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    dst_rgba[0] = 255u;
    dst_rgba[1] = blue;
    dst_rgba[2] = green;
    dst_rgba[3] = red;
    src_raw += 3;
    dst_rgba += 4;
  }
}
165 
// Converts a row of 3-byte RAW pixels (bytes R, G, B) to 3-byte RGB24 pixels
// (bytes B, G, R) by swapping the first and third byte of each pixel.  All
// three source bytes are read before any destination byte is written.
//   src_raw:   source row, 3 bytes per pixel.
//   dst_rgb24: destination row, 3 bytes per pixel.
//   width:     number of pixels to convert.
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t red = src_raw[0];
    const uint8_t green = src_raw[1];
    const uint8_t blue = src_raw[2];
    dst_rgb24[0] = blue;
    dst_rgb24[1] = green;
    dst_rgb24[2] = red;
    src_raw += 3;
    dst_rgb24 += 3;
  }
}
179 
// Expands a row of 16-bit RGB565 pixels to 32-bit ARGB with alpha 255.
// Each source pixel is two bytes, low byte first: blue in bits 0-4, green in
// bits 5-10, red in bits 11-15.  Channels are widened to 8 bits by
// replicating their top bits into the low bits.
//   src_rgb565: source row, 2 bytes per pixel.
//   dst_argb:   destination row, 4 bytes per pixel (B, G, R, A).
//   width:      number of pixels to convert.
void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
                       uint8_t* dst_argb,
                       int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const unsigned pixel = src_rgb565[0] | ((unsigned)src_rgb565[1] << 8);
    const unsigned blue5 = pixel & 0x1f;
    const unsigned green6 = (pixel >> 5) & 0x3f;
    const unsigned red5 = pixel >> 11;
    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
    dst_argb[1] = (uint8_t)((green6 << 2) | (green6 >> 4));
    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
    dst_argb[3] = 255u;
    src_rgb565 += 2;
    dst_argb += 4;
  }
}
196 
// Expands a row of 16-bit ARGB1555 pixels to 32-bit ARGB.
// Each source pixel is two bytes, low byte first: blue in bits 0-4, green in
// bits 5-9, red in bits 10-14 and a 1-bit alpha in bit 15.  Colour channels
// are widened by replicating their top bits; alpha becomes 0 or 255.
//   src_argb1555: source row, 2 bytes per pixel.
//   dst_argb:     destination row, 4 bytes per pixel (B, G, R, A).
//   width:        number of pixels to convert.
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
                         uint8_t* dst_argb,
                         int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const unsigned pixel = src_argb1555[0] | ((unsigned)src_argb1555[1] << 8);
    const unsigned blue5 = pixel & 0x1f;
    const unsigned green5 = (pixel >> 5) & 0x1f;
    const unsigned red5 = (pixel >> 10) & 0x1f;
    const unsigned alpha1 = pixel >> 15;
    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
    dst_argb[1] = (uint8_t)((green5 << 3) | (green5 >> 2));
    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
    dst_argb[3] = alpha1 ? 255u : 0u;
    src_argb1555 += 2;
    dst_argb += 4;
  }
}
214 
// Expands a row of 16-bit ARGB4444 pixels to 32-bit ARGB.
// The first source byte holds blue (low nibble) and green (high nibble); the
// second holds red (low nibble) and alpha (high nibble).  Each 4-bit value is
// widened to 8 bits by duplicating the nibble (x * 0x11).
//   src_argb4444: source row, 2 bytes per pixel.
//   dst_argb:     destination row, 4 bytes per pixel (B, G, R, A).
//   width:        number of pixels to convert.
void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
                         uint8_t* dst_argb,
                         int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t gb = src_argb4444[0];
    const uint8_t ar = src_argb4444[1];
    dst_argb[0] = (uint8_t)((gb & 0x0f) * 0x11);
    dst_argb[1] = (uint8_t)((gb >> 4) * 0x11);
    dst_argb[2] = (uint8_t)((ar & 0x0f) * 0x11);
    dst_argb[3] = (uint8_t)((ar >> 4) * 0x11);
    src_argb4444 += 2;
    dst_argb += 4;
  }
}
232 
// Converts a row of 32-bit AR30 pixels (10-bit B, G, R in bits 0-9, 10-19,
// 20-29 and 2-bit alpha in bits 30-31) to 8-bit ARGB.
// Colour channels keep their top 8 bits; alpha is replicated to 8 bits.
// Both the load and the store go through memcpy so that neither pointer has
// to be 4-byte aligned; the word is read and written in host byte order.
//   src_ar30: source row, 4 bytes per pixel.
//   dst_argb: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t argb;
    memcpy(&ar30, src_ar30, sizeof ar30);
    {
      const uint32_t b = (ar30 >> 2) & 0xff;
      const uint32_t g = (ar30 >> 12) & 0xff;
      const uint32_t r = (ar30 >> 22) & 0xff;
      const uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
      argb = b | (g << 8) | (r << 16) | (a << 24);
    }
    memcpy(dst_argb, &argb, sizeof argb);
    dst_argb += 4;
    src_ar30 += 4;
  }
}
247 
// Converts a row of 32-bit AR30 pixels (10-bit B, G, R in bits 0-9, 10-19,
// 20-29 and 2-bit alpha in bits 30-31) to 8-bit ABGR, i.e. red in the low
// byte of the output word and blue in the third byte.
// Colour channels keep their top 8 bits; alpha is replicated to 8 bits.
// Both the load and the store go through memcpy so that neither pointer has
// to be 4-byte aligned; the word is read and written in host byte order.
//   src_ar30: source row, 4 bytes per pixel.
//   dst_abgr: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t abgr;
    memcpy(&ar30, src_ar30, sizeof ar30);
    {
      const uint32_t b = (ar30 >> 2) & 0xff;
      const uint32_t g = (ar30 >> 12) & 0xff;
      const uint32_t r = (ar30 >> 22) & 0xff;
      const uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
      abgr = r | (g << 8) | (b << 16) | (a << 24);
    }
    memcpy(dst_abgr, &abgr, sizeof abgr);
    dst_abgr += 4;
    src_ar30 += 4;
  }
}
262 
// Converts a row of 32-bit AR30 pixels to AB30 by exchanging the 10-bit
// fields in bits 0-9 and bits 20-29; green (bits 10-19) and alpha (bits
// 30-31) stay where they are.
// Both the load and the store go through memcpy so that neither pointer has
// to be 4-byte aligned; the word is read and written in host byte order.
//   src_ar30: source row, 4 bytes per pixel.
//   dst_ab30: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t ab30;
    memcpy(&ar30, src_ar30, sizeof ar30);
    {
      const uint32_t b = ar30 & 0x3ff;
      const uint32_t ga = ar30 & 0xc00ffc00;
      const uint32_t r = (ar30 >> 20) & 0x3ff;
      ab30 = r | ga | (b << 20);
    }
    memcpy(dst_ab30, &ab30, sizeof ab30);
    dst_ab30 += 4;
    src_ar30 += 4;
  }
}
276 
// Packs a row of 4-byte ARGB pixels (bytes B, G, R, A) into 3-byte RGB24
// pixels (bytes B, G, R), dropping the alpha byte.
//   src_argb: source row, 4 bytes per pixel.
//   dst_rgb:  destination row, 3 bytes per pixel.
//   width:    number of pixels to convert.
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    dst_rgb[0] = blue;
    dst_rgb[1] = green;
    dst_rgb[2] = red;
    src_argb += 4;
    dst_rgb += 3;
  }
}
290 
// Packs a row of 4-byte ARGB pixels (bytes B, G, R, A) into 3-byte RAW
// pixels (bytes R, G, B), dropping the alpha byte.
//   src_argb: source row, 4 bytes per pixel.
//   dst_rgb:  destination row, 3 bytes per pixel.
//   width:    number of pixels to convert.
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    dst_rgb[0] = red;
    dst_rgb[1] = green;
    dst_rgb[2] = blue;
    src_argb += 4;
    dst_rgb += 3;
  }
}
304 
// Packs a row of 4-byte ARGB pixels (bytes B, G, R, A) into 16-bit RGB565:
// blue in bits 0-4, green in bits 5-10, red in bits 11-15.  Alpha is
// dropped and each channel is truncated to its top bits.
// Every pixel is written as two bytes, low byte first, which is the layout
// RGB565ToARGBRow_C decodes.  Writing bytes avoids shifting a promoted int
// into its sign bit (the old paired store did r1 << 27) and removes the
// dependency on dst_rgb alignment and on host byte order for the odd
// trailing pixel.
//   src_argb: source row, 4 bytes per pixel.
//   dst_rgb:  destination row, 2 bytes per pixel.
//   width:    number of pixels to convert.
void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b = src_argb[0] >> 3;
    const uint32_t g = src_argb[1] >> 2;
    const uint32_t r = src_argb[2] >> 3;
    const uint32_t rgb565 = b | (g << 5) | (r << 11);
    dst_rgb[0] = (uint8_t)(rgb565 & 0xff);
    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
326 
327 // dither4 is a row of 4 values from 4x4 dither matrix.
328 // The 4x4 matrix contains values to increase RGB.  When converting to
329 // fewer bits (565) this provides an ordered dither.
330 // The order in the 4x4 matrix in first byte is upper left.
331 // The 4 values are passed as an int, then referenced as an array, so
332 // endian will not affect order of the original matrix.  But dither4
333 // will contain the first pixel in the lower byte for little endian
334 // or the upper byte for big endian.
// Packs a row of 4-byte ARGB pixels into 16-bit RGB565 after adding an
// ordered-dither offset to each colour channel.
// The four bytes of dither4 are read in memory order; pixel x uses byte
// (x & 3).  The same offset is added to B, G and R, the sum is capped at 255
// with clamp255, and the channel is then truncated to 5/6/5 bits.
// Every pixel is written as two bytes, low byte first, which is the layout
// RGB565ToARGBRow_C decodes; this removes the dependency on dst_rgb
// alignment and on host byte order that the previous uint16_t stores had.
//   src_argb: source row, 4 bytes per pixel.
//   dst_rgb:  destination row, 2 bytes per pixel.
//   dither4:  four packed 8-bit dither offsets.
//   width:    number of pixels to convert.
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
                             uint8_t* dst_rgb,
                             const uint32_t dither4,
                             int width) {
  const unsigned char* dither = (const unsigned char*)(&dither4);
  int x;
  for (x = 0; x < width; ++x) {
    const int offset = dither[x & 3];
    const uint32_t b = (uint32_t)(clamp255(src_argb[0] + offset) >> 3);
    const uint32_t g = (uint32_t)(clamp255(src_argb[1] + offset) >> 2);
    const uint32_t r = (uint32_t)(clamp255(src_argb[2] + offset) >> 3);
    const uint32_t rgb565 = b | (g << 5) | (r << 11);
    dst_rgb[0] = (uint8_t)(rgb565 & 0xff);
    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
362 
// Packs a row of 4-byte ARGB pixels (bytes B, G, R, A) into 16-bit ARGB1555:
// blue in bits 0-4, green in bits 5-9, red in bits 10-14 and the top bit of
// alpha in bit 15.  Channels are truncated to their top bits.
// Every pixel is written as two bytes, low byte first, which is the layout
// ARGB1555ToARGBRow_C decodes; this removes the dependency on dst_rgb
// alignment and on host byte order that the previous uint16_t stores had.
//   src_argb: source row, 4 bytes per pixel.
//   dst_rgb:  destination row, 2 bytes per pixel.
//   width:    number of pixels to convert.
void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b = src_argb[0] >> 3;
    const uint32_t g = src_argb[1] >> 3;
    const uint32_t r = src_argb[2] >> 3;
    const uint32_t a = src_argb[3] >> 7;
    const uint32_t argb1555 = b | (g << 5) | (r << 10) | (a << 15);
    dst_rgb[0] = (uint8_t)(argb1555 & 0xff);
    dst_rgb[1] = (uint8_t)(argb1555 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
387 
// Packs a row of 4-byte ARGB pixels (bytes B, G, R, A) into 16-bit ARGB4444:
// blue in bits 0-3, green in bits 4-7, red in bits 8-11, alpha in bits
// 12-15.  Each channel is truncated to its top 4 bits.
// Every pixel is written as two bytes, low byte first, which is the layout
// ARGB4444ToARGBRow_C decodes; this removes the dependency on dst_rgb
// alignment and on host byte order that the previous uint16_t stores had.
//   src_argb: source row, 4 bytes per pixel.
//   dst_rgb:  destination row, 2 bytes per pixel.
//   width:    number of pixels to convert.
void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b = src_argb[0] >> 4;
    const uint32_t g = src_argb[1] >> 4;
    const uint32_t r = src_argb[2] >> 4;
    const uint32_t a = src_argb[3] >> 4;
    const uint32_t argb4444 = b | (g << 4) | (r << 8) | (a << 12);
    dst_rgb[0] = (uint8_t)(argb4444 & 0xff);
    dst_rgb[1] = (uint8_t)(argb4444 >> 8);
    dst_rgb += 2;
    src_argb += 4;
  }
}
412 
// Converts a row of 8-bit ABGR pixels to 32-bit AR30.
// Each 8-bit colour channel is widened to 10 bits by appending its top two
// bits; alpha keeps only its top two bits.  Source byte 2 lands in bits 0-9
// of the output word, byte 1 in bits 10-19, byte 0 in bits 20-29 and the
// alpha bits in bits 30-31.
// The result is stored with memcpy (host byte order) so dst_ar30 does not
// need to be 4-byte aligned.
//   src_abgr: source row, 4 bytes per pixel.
//   dst_ar30: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t c0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
    const uint32_t c1 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
    const uint32_t c2 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
    const uint32_t a2 = (uint32_t)(src_abgr[3] >> 6);
    const uint32_t ar30 = c2 | (c1 << 10) | (c0 << 20) | (a2 << 30);
    memcpy(dst_ar30, &ar30, sizeof ar30);
    dst_ar30 += 4;
    src_abgr += 4;
  }
}
425 
// Converts a row of 8-bit ARGB pixels (bytes B, G, R, A) to 32-bit AR30.
// Each 8-bit colour channel is widened to 10 bits by appending its top two
// bits; alpha keeps only its top two bits.  Blue lands in bits 0-9 of the
// output word, green in bits 10-19, red in bits 20-29 and alpha in bits
// 30-31.
// The result is stored with memcpy (host byte order) so dst_ar30 does not
// need to be 4-byte aligned.
//   src_argb: source row, 4 bytes per pixel.
//   dst_ar30: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t b10 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
    const uint32_t g10 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
    const uint32_t r10 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
    const uint32_t a2 = (uint32_t)(src_argb[3] >> 6);
    const uint32_t ar30 = b10 | (g10 << 10) | (r10 << 20) | (a2 << 30);
    memcpy(dst_ar30, &ar30, sizeof ar30);
    dst_ar30 += 4;
    src_argb += 4;
  }
}
438 
// Widens a row of 8-bit ARGB pixels to 16 bits per channel, keeping channel
// order.  Each byte v becomes v * 0x0101, i.e. the byte repeated in both
// halves of the 16-bit value.
//   src_argb: source row, 4 bytes per pixel.
//   dst_ar64: destination row, 4 uint16_t per pixel.
//   width:    number of pixels to convert.
void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    int channel;
    for (channel = 0; channel < 4; ++channel) {
      dst_ar64[channel] = src_argb[channel] * 0x0101;
    }
    src_argb += 4;
    dst_ar64 += 4;
  }
}
450 
// Widens a row of 8-bit ARGB pixels to 16 bits per channel while swapping
// the first and third channel (output element 0 comes from source byte 2 and
// vice versa).  Each byte v becomes v * 0x0101.
//   src_argb: source row, 4 bytes per pixel.
//   dst_ab64: destination row, 4 uint16_t per pixel.
//   width:    number of pixels to convert.
void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_ab64[0] = src_argb[2] * 0x0101;
    dst_ab64[1] = src_argb[1] * 0x0101;
    dst_ab64[2] = src_argb[0] * 0x0101;
    dst_ab64[3] = src_argb[3] * 0x0101;
    src_argb += 4;
    dst_ab64 += 4;
  }
}
462 
// Narrows a row of 16-bit-per-channel AR64 pixels to 8-bit ARGB, keeping
// channel order.  Each output byte is the high byte of the source value.
//   src_ar64: source row, 4 uint16_t per pixel.
//   dst_argb: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    int channel;
    for (channel = 0; channel < 4; ++channel) {
      dst_argb[channel] = src_ar64[channel] >> 8;
    }
    src_ar64 += 4;
    dst_argb += 4;
  }
}
474 
// Narrows a row of 16-bit-per-channel AB64 pixels to 8-bit ARGB, swapping
// the first and third channel.  Each output byte is the high byte of the
// corresponding source value.
//   src_ab64: source row, 4 uint16_t per pixel.
//   dst_argb: destination row, 4 bytes per pixel.
//   width:    number of pixels to convert.
void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_argb[0] = src_ab64[2] >> 8;
    dst_argb[1] = src_ab64[1] >> 8;
    dst_argb[2] = src_ab64[0] >> 8;
    dst_argb[3] = src_ab64[3] >> 8;
    src_ab64 += 4;
    dst_argb += 4;
  }
}
486 
487 // TODO(fbarchard): Make shuffle compatible with SIMD versions
// Reorders the four 16-bit channels of every AR64 pixel in a row.
//   src_ar64: source row, accessed as uint16_t, 4 values per pixel.
//   dst_ar64: destination row, same layout; may be the same buffer as the
//             source because each pixel is fully read before it is written.
//   shuffler: byte-offset table; entries 0, 2, 4 and 6 (each divided by 2)
//             select the source channel for output channels 0..3.
//   width:    the loop runs width / 2 times, one 4-channel pixel per
//             iteration.
// NOTE(review): width presumably counts 32-bit units rather than 64-bit
// pixels - confirm against the callers.
void AR64ShuffleRow_C(const uint8_t* src_ar64,
                      uint8_t* dst_ar64,
                      const uint8_t* shuffler,
                      int width) {
  const uint16_t* in = (const uint16_t*)src_ar64;
  uint16_t* out = (uint16_t*)dst_ar64;
  const int pick0 = shuffler[0] / 2;
  const int pick1 = shuffler[2] / 2;
  const int pick2 = shuffler[4] / 2;
  const int pick3 = shuffler[6] / 2;
  int remaining;
  for (remaining = width / 2; remaining > 0; --remaining) {
    // Read the whole pixel first so in-place shuffling works.
    const uint16_t c0 = in[pick0];
    const uint16_t c1 = in[pick1];
    const uint16_t c2 = in[pick2];
    const uint16_t c3 = in[pick3];
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
    out[3] = c3;
    in += 4;
    out += 4;
  }
}
514 
515 #ifdef LIBYUV_RGB7
516 // Old 7 bit math for compatibility on unsupported platforms.
// Computes luma from 8-bit R, G, B using 7-bit fixed-point weights
// (33, 65, 13), then adds the offset of 16.  No rounding term is applied
// before the shift.
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 33 * r + 65 * g + 13 * b;
  return (weighted >> 7) + 16;
}
520 #else
521 // 8 bit
522 // Intel SSE/AVX uses the following equivalent formula
523 // 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round.
524 //  return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
525 //  0x7e80) >> 8;
526 
// Computes luma from 8-bit R, G, B using 8-bit fixed-point weights
// (66, 129, 25).  The constant 0x1080 folds in the offset of 16 (0x1000) and
// the rounding term (0x80) before the shift.
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 66 * r + 129 * g + 25 * b;
  return (weighted + 0x1080) >> 8;
}
530 #endif
531 
// Average of a and b, rounded up on ties: (a + b + 1) >> 1.
#define AVGB(a, b) ((1 + (a) + (b)) >> 1)
533 
534 // LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round.
535 #ifdef LIBYUV_RGBTOU_TRUNCATE
// Computes the U chroma sample from 8-bit R, G, B with 8-bit fixed-point
// weights (-38, -74, 112).  0x8000 adds the centre value of 128; there is no
// rounding term, so the result is truncated.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * b - 74 * g - 38 * r;
  return (weighted + 0x8000) >> 8;
}
// Computes the V chroma sample from 8-bit R, G, B with 8-bit fixed-point
// weights (112, -94, -18).  0x8000 adds the centre value of 128; there is no
// rounding term, so the result is truncated.
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return (weighted + 0x8000) >> 8;
}
542 #else
543 // TODO(fbarchard): Add rounding to x86 SIMD and use this
// Computes the U chroma sample from 8-bit R, G, B with 8-bit fixed-point
// weights (-38, -74, 112).  0x8080 adds the centre value of 128 (0x8000)
// plus a rounding term (0x80) before the shift.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * b - 74 * g - 38 * r;
  return (weighted + 0x8080) >> 8;
}
// Computes the V chroma sample from 8-bit R, G, B with 8-bit fixed-point
// weights (112, -94, -18).  0x8080 adds the centre value of 128 (0x8000)
// plus a rounding term (0x80) before the shift.
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return (weighted + 0x8080) >> 8;
}
550 #endif
551 
552 // LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb.
553 #if !defined(LIBYUV_ARGBTOUV_PAVGB)
// Computes the U chroma sample from R, G, B inputs that carry twice the
// 8-bit scale (for example the sum of two samples).  The weights are the
// 8-bit U weights halved with integer division (112/2, 74/2, 38/2), and
// 0x8080 adds the centre value plus a rounding term before the shift.
static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
  const int weight_b = 112 / 2;
  const int weight_g = 74 / 2;
  const int weight_r = 38 / 2;
  const int weighted = weight_b * b - weight_g * g - weight_r * r;
  return (weighted + 0x8080) >> 8;
}
// Computes the V chroma sample from R, G, B inputs that carry twice the
// 8-bit scale (for example the sum of two samples).  The weights are the
// 8-bit V weights halved with integer division (112/2, 94/2, 18/2), and
// 0x8080 adds the centre value plus a rounding term before the shift.
static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
  const int weight_r = 112 / 2;
  const int weight_g = 94 / 2;
  const int weight_b = 18 / 2;
  const int weighted = weight_r * r - weight_g * g - weight_b * b;
  return (weighted + 0x8080) >> 8;
}
560 #endif
561 
562 // ARGBToY_C and ARGBToUV_C
563 // Intel version mimic SSE/AVX which does 2 pavgb
564 #if LIBYUV_ARGBTOUV_PAVGB
// Generates NAME##ToYRow_C and NAME##ToUVRow_C for one packed RGB layout.
// R, G and B are the byte offsets of each channel inside a pixel and BPP is
// the pixel size in bytes.
// The Y function converts every pixel with RGBToY.  The UV function reads
// two rows (src_rgb and src_rgb + src_stride_rgb) and writes one U and one V
// sample per 2x2 block: each channel is averaged vertically and then
// horizontally with AVGB before RGBToU / RGBToV.  When width is odd the last
// column is averaged vertically only.
#define MAKEROWY(NAME, R, G, B, BPP)                                         \
  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) {   \
    int i;                                                                   \
    for (i = 0; i < width; ++i) {                                            \
      *dst_y++ = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]);                 \
      src_rgb += BPP;                                                        \
    }                                                                        \
  }                                                                          \
  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb,           \
                       uint8_t* dst_u, uint8_t* dst_v, int width) {          \
    const uint8_t* row0 = src_rgb;                                           \
    const uint8_t* row1 = src_rgb + src_stride_rgb;                          \
    int i;                                                                   \
    for (i = 0; i < width - 1; i += 2) {                                     \
      const uint8_t avg_b = AVGB(AVGB(row0[B], row1[B]),                     \
                                 AVGB(row0[B + BPP], row1[B + BPP]));        \
      const uint8_t avg_g = AVGB(AVGB(row0[G], row1[G]),                     \
                                 AVGB(row0[G + BPP], row1[G + BPP]));        \
      const uint8_t avg_r = AVGB(AVGB(row0[R], row1[R]),                     \
                                 AVGB(row0[R + BPP], row1[R + BPP]));        \
      *dst_u++ = RGBToU(avg_r, avg_g, avg_b);                                \
      *dst_v++ = RGBToV(avg_r, avg_g, avg_b);                                \
      row0 += BPP * 2;                                                       \
      row1 += BPP * 2;                                                       \
    }                                                                        \
    if (width & 1) {                                                         \
      const uint8_t avg_b = AVGB(row0[B], row1[B]);                          \
      const uint8_t avg_g = AVGB(row0[G], row1[G]);                          \
      const uint8_t avg_r = AVGB(row0[R], row1[R]);                          \
      *dst_u = RGBToU(avg_r, avg_g, avg_b);                                  \
      *dst_v = RGBToV(avg_r, avg_g, avg_b);                                  \
    }                                                                        \
  }
600 #else
601 // ARM version does sum / 2 then multiply by 2x smaller coefficients
// Generates NAME##ToYRow_C and NAME##ToUVRow_C for one packed RGB layout.
// R, G and B are the byte offsets of each channel inside a pixel and BPP is
// the pixel size in bytes.
// The Y function converts every pixel with RGBToY.  The UV function reads
// two rows (src_rgb and src_rgb + src_stride_rgb) and writes one U and one V
// sample per 2x2 block: the four samples of each channel are summed, rounded
// and halved, which leaves a value at twice the 8-bit scale for RGB2xToU /
// RGB2xToV.  When width is odd the last column uses the plain sum of its two
// vertical samples, which is on the same doubled scale.
#define MAKEROWY(NAME, R, G, B, BPP)                                         \
  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) {   \
    int i;                                                                   \
    for (i = 0; i < width; ++i) {                                            \
      *dst_y++ = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]);                 \
      src_rgb += BPP;                                                        \
    }                                                                        \
  }                                                                          \
  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb,           \
                       uint8_t* dst_u, uint8_t* dst_v, int width) {          \
    const uint8_t* row0 = src_rgb;                                           \
    const uint8_t* row1 = src_rgb + src_stride_rgb;                          \
    int i;                                                                   \
    for (i = 0; i < width - 1; i += 2) {                                     \
      const uint16_t sum_b =                                                 \
          (row0[B] + row0[B + BPP] + row1[B] + row1[B + BPP] + 1) >> 1;      \
      const uint16_t sum_g =                                                 \
          (row0[G] + row0[G + BPP] + row1[G] + row1[G + BPP] + 1) >> 1;      \
      const uint16_t sum_r =                                                 \
          (row0[R] + row0[R + BPP] + row1[R] + row1[R + BPP] + 1) >> 1;      \
      *dst_u++ = RGB2xToU(sum_r, sum_g, sum_b);                              \
      *dst_v++ = RGB2xToV(sum_r, sum_g, sum_b);                              \
      row0 += BPP * 2;                                                       \
      row1 += BPP * 2;                                                       \
    }                                                                        \
    if (width & 1) {                                                         \
      const uint16_t sum_b = row0[B] + row1[B];                              \
      const uint16_t sum_g = row0[G] + row1[G];                              \
      const uint16_t sum_r = row0[R] + row1[R];                              \
      *dst_u = RGB2xToU(sum_r, sum_g, sum_b);                                \
      *dst_v = RGB2xToV(sum_r, sum_g, sum_b);                                \
    }                                                                        \
  }
640 #endif
641 
// Instantiate the Y-row and UV-row converters for each packed RGB layout.
// Arguments: name prefix, byte offset of R, byte offset of G, byte offset of
// B, bytes per pixel.  Each line defines <prefix>ToYRow_C and
// <prefix>ToUVRow_C.
642 MAKEROWY(ARGB, 2, 1, 0, 4)
643 MAKEROWY(BGRA, 1, 2, 3, 4)
644 MAKEROWY(ABGR, 0, 1, 2, 4)
645 MAKEROWY(RGBA, 3, 2, 1, 4)
646 MAKEROWY(RGB24, 2, 1, 0, 3)
647 MAKEROWY(RAW, 0, 1, 2, 3)
// The generator is no longer needed once all layouts are instantiated.
648 #undef MAKEROWY
649 
650 // JPeg uses a variation on BT.601-1 full range
651 // y =  0.29900 * r + 0.58700 * g + 0.11400 * b
652 // u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
653 // v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
654 // BT.601 Mpeg range uses:
655 // b 0.1016 * 255 = 25.908 = 25
656 // g 0.5078 * 255 = 129.489 = 129
657 // r 0.2578 * 255 = 65.739 = 66
658 // JPeg 7 bit Y (deprecated)
659 // b 0.11400 * 128 = 14.592 = 15
660 // g 0.58700 * 128 = 75.136 = 75
661 // r 0.29900 * 128 = 38.272 = 38
662 // JPeg 8 bit Y:
663 // b 0.11400 * 256 = 29.184 = 29
664 // g 0.58700 * 256 = 150.272 = 150
665 // r 0.29900 * 256 = 76.544 = 77
666 // JPeg 8 bit U:
667 // b  0.50000 * 255 = 127.5 = 127
668 // g -0.33126 * 255 = -84.4713 = -84
669 // r -0.16874 * 255 = -43.0287 = -43
670 // JPeg 8 bit V:
671 // b -0.08131 * 255 = -20.73405 = -20
672 // g -0.41869 * 255 = -106.76595 = -107
673 // r  0.50000 * 255 = 127.5 = 127
674 
675 #ifdef LIBYUV_RGB7
676 // Old 7 bit math for compatibility on unsupported platforms.
// Computes full-range (JPEG) luma from 8-bit R, G, B using 7-bit fixed-point
// weights (38, 75, 15).  64 is the rounding term for the 7-bit shift; no
// offset is added.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 38 * r + 75 * g + 15 * b;
  return (weighted + 64) >> 7;
}
680 #else
681 // 8 bit
// Computes full-range (JPEG) luma from 8-bit R, G, B using 8-bit fixed-point
// weights (77, 150, 29).  128 is the rounding term for the 8-bit shift; no
// offset is added.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 77 * r + 150 * g + 29 * b;
  return (weighted + 128) >> 8;
}
685 #endif
686 
687 #if defined(LIBYUV_ARGBTOUV_PAVGB)
// Computes the full-range (JPEG) U chroma sample from 8-bit R, G, B with
// weights (-43, -84, 127).  0x8080 adds the centre value of 128 (0x8000)
// plus a rounding term (0x80) before the shift.
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 127 * b - 84 * g - 43 * r;
  return (weighted + 0x8080) >> 8;
}
// Computes the full-range (JPEG) V chroma sample from 8-bit R, G, B with
// weights (127, -107, -20).  0x8080 adds the centre value of 128 (0x8000)
// plus a rounding term (0x80) before the shift.
static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
  const int weighted = 127 * r - 107 * g - 20 * b;
  return (weighted + 0x8080) >> 8;
}
694 #else
// Computes the full-range (JPEG) U chroma sample from R, G, B inputs that
// carry twice the 8-bit scale.  The weights are the 8-bit UJ weights halved
// with integer division (127/2, 84/2, 43/2), and 0x8080 adds the centre
// value plus a rounding term before the shift.
static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
  const int weight_b = 127 / 2;
  const int weight_g = 84 / 2;
  const int weight_r = 43 / 2;
  const int weighted = weight_b * b - weight_g * g - weight_r * r;
  return (weighted + 0x8080) >> 8;
}
// Computes the full-range (JPEG) V chroma sample from R, G, B inputs that
// carry twice the 8-bit scale.  The weights are the 8-bit VJ weights halved
// with integer division (127/2, 107/2, 20/2), and 0x8080 adds the centre
// value plus a rounding term before the shift.
static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
  const int weight_r = 127 / 2;
  const int weight_g = 107 / 2;
  const int weight_b = 20 / 2;
  const int weighted = weight_r * r - weight_g * g - weight_b * b;
  return (weighted + 0x8080) >> 8;
}
701 #endif
702 
703 // ARGBToYJ_C and ARGBToUVJ_C
704 // Intel version mimic SSE/AVX which does 2 pavgb
705 #if LIBYUV_ARGBTOUV_PAVGB
// Generates NAME##ToYJRow_C and NAME##ToUVJRow_C for one packed RGB layout.
// R, G and B are the byte offsets of each channel inside a pixel and BPP is
// the pixel size in bytes.
// The YJ function converts every pixel with RGBToYJ.  The UVJ function reads
// two rows (src_rgb and src_rgb + src_stride_rgb) and writes one U and one V
// sample per 2x2 block: each channel is averaged vertically and then
// horizontally with AVGB before RGBToUJ / RGBToVJ.  When width is odd the
// last column is averaged vertically only.
#define MAKEROWYJ(NAME, R, G, B, BPP)                                        \
  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) {  \
    int i;                                                                   \
    for (i = 0; i < width; ++i) {                                            \
      *dst_y++ = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]);                \
      src_rgb += BPP;                                                        \
    }                                                                        \
  }                                                                          \
  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb,          \
                        uint8_t* dst_u, uint8_t* dst_v, int width) {         \
    const uint8_t* row0 = src_rgb;                                           \
    const uint8_t* row1 = src_rgb + src_stride_rgb;                          \
    int i;                                                                   \
    for (i = 0; i < width - 1; i += 2) {                                     \
      const uint8_t avg_b = AVGB(AVGB(row0[B], row1[B]),                     \
                                 AVGB(row0[B + BPP], row1[B + BPP]));        \
      const uint8_t avg_g = AVGB(AVGB(row0[G], row1[G]),                     \
                                 AVGB(row0[G + BPP], row1[G + BPP]));        \
      const uint8_t avg_r = AVGB(AVGB(row0[R], row1[R]),                     \
                                 AVGB(row0[R + BPP], row1[R + BPP]));        \
      *dst_u++ = RGBToUJ(avg_r, avg_g, avg_b);                               \
      *dst_v++ = RGBToVJ(avg_r, avg_g, avg_b);                               \
      row0 += BPP * 2;                                                       \
      row1 += BPP * 2;                                                       \
    }                                                                        \
    if (width & 1) {                                                         \
      const uint8_t avg_b = AVGB(row0[B], row1[B]);                          \
      const uint8_t avg_g = AVGB(row0[G], row1[G]);                          \
      const uint8_t avg_r = AVGB(row0[R], row1[R]);                          \
      *dst_u = RGBToUJ(avg_r, avg_g, avg_b);                                 \
      *dst_v = RGBToVJ(avg_r, avg_g, avg_b);                                 \
    }                                                                        \
  }
741 #else
// ARM version does sum / 2 then multiply by 2x smaller coefficients
//
// MAKEROWYJ(NAME, R, G, B, BPP) generates two row functions:
//   NAME##ToYJRow_C  - one Y value per pixel via RGBToYJ.
//   NAME##ToUVJRow_C - one U and one V value per 2x2 block of pixels taken
//                      from src_rgb and the row src_stride_rgb bytes after it.
// R, G and B are the byte offsets of each channel inside a pixel and BPP is
// the pixel size in bytes.
// In the UVJ function each channel of a 2x2 block is reduced to
// (sum of 4 samples + 1) >> 1 before being handed to RGB2xToUJ / RGB2xToVJ.
// When width is odd the last column has only two samples (one per row); their
// plain sum is passed instead.
#define MAKEROWYJ(NAME, R, G, B, BPP)                                       \
  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
    int x;                                                                  \
    for (x = 0; x < width; ++x) {                                           \
      dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]);               \
      src_rgb += BPP;                                                       \
      dst_y += 1;                                                           \
    }                                                                       \
  }                                                                         \
  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
                        uint8_t* dst_u, uint8_t* dst_v, int width) {        \
    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                     \
    int x;                                                                  \
    for (x = 0; x < width - 1; x += 2) {                                    \
      uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] +          \
                     src_rgb1[B + BPP] + 1) >>                              \
                    1;                                                      \
      uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] +          \
                     src_rgb1[G + BPP] + 1) >>                              \
                    1;                                                      \
      uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] +          \
                     src_rgb1[R + BPP] + 1) >>                              \
                    1;                                                      \
      dst_u[0] = RGB2xToUJ(ar, ag, ab);                                     \
      dst_v[0] = RGB2xToVJ(ar, ag, ab);                                     \
      src_rgb += BPP * 2;                                                   \
      src_rgb1 += BPP * 2;                                                  \
      dst_u += 1;                                                           \
      dst_v += 1;                                                           \
    }                                                                       \
    if (width & 1) {                                                        \
      uint16_t ab = (src_rgb[B] + src_rgb1[B]);                             \
      uint16_t ag = (src_rgb[G] + src_rgb1[G]);                             \
      uint16_t ar = (src_rgb[R] + src_rgb1[R]);                             \
      dst_u[0] = RGB2xToUJ(ar, ag, ab);                                     \
      dst_v[0] = RGB2xToVJ(ar, ag, ab);                                     \
    }                                                                       \
  }
781 
782 #endif
783 
// Instantiate the ToYJRow_C / ToUVJRow_C pair for each packed RGB layout.
// Arguments are (NAME, R, G, B, BPP): the byte offsets of the red, green and
// blue samples inside one pixel, followed by the pixel size in bytes.
MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
MAKEROWYJ(RGB24, 2, 1, 0, 3)
MAKEROWYJ(RAW, 0, 1, 2, 3)
// The generator is only needed for the four instantiations above.
#undef MAKEROWYJ
789 
// Converts a row of RGB565 pixels (2 bytes per pixel) to Y values.
// Blue is the low 5 bits of byte 0, green straddles both bytes (6 bits) and
// red is the top 5 bits of byte 1. Every field is widened to 8 bits by
// copying its most significant bits into the vacated low bits before the
// triple is handed to RGBToY.
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  for (; width > 0; --width) {
    const uint8_t byte0 = src_rgb565[0];
    const uint8_t byte1 = src_rgb565[1];
    const uint8_t b5 = byte0 & 0x1f;
    const uint8_t g6 = (uint8_t)((byte0 >> 5) | ((byte1 & 0x07) << 3));
    const uint8_t r5 = (uint8_t)(byte1 >> 3);
    const uint8_t blue = (uint8_t)((b5 << 3) | (b5 >> 2));
    const uint8_t green = (uint8_t)((g6 << 2) | (g6 >> 4));
    const uint8_t red = (uint8_t)((r5 << 3) | (r5 >> 2));
    *dst_y++ = RGBToY(red, green, blue);
    src_rgb565 += 2;
  }
}
804 
// Converts a row of ARGB1555 pixels (2 bytes per pixel) to Y values.
// Blue is the low 5 bits of byte 0, green is split across both bytes and red
// is bits 2..6 of byte 1; bit 7 of byte 1 is not used here. Each 5-bit field
// is widened to 8 bits by replicating its top 3 bits into the low bits.
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
  for (; width > 0; --width) {
    const uint8_t byte0 = src_argb1555[0];
    const uint8_t byte1 = src_argb1555[1];
    const uint8_t b5 = byte0 & 0x1f;
    const uint8_t g5 = (uint8_t)((byte0 >> 5) | ((byte1 & 0x03) << 3));
    const uint8_t r5 = (uint8_t)((byte1 & 0x7c) >> 2);
    const uint8_t blue = (uint8_t)((b5 << 3) | (b5 >> 2));
    const uint8_t green = (uint8_t)((g5 << 3) | (g5 >> 2));
    const uint8_t red = (uint8_t)((r5 << 3) | (r5 >> 2));
    *dst_y++ = RGBToY(red, green, blue);
    src_argb1555 += 2;
  }
}
819 
// Converts a row of ARGB4444 pixels (2 bytes per pixel) to Y values.
// Blue and green are the low and high nibbles of byte 0, red is the low
// nibble of byte 1; the high nibble of byte 1 is not used here. Each nibble
// is widened to 8 bits by duplicating it into both halves of the byte.
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
  for (; width > 0; --width) {
    const uint8_t b4 = src_argb4444[0] & 0x0f;
    const uint8_t g4 = (uint8_t)(src_argb4444[0] >> 4);
    const uint8_t r4 = src_argb4444[1] & 0x0f;
    const uint8_t blue = (uint8_t)((b4 << 4) | b4);
    const uint8_t green = (uint8_t)((g4 << 4) | g4);
    const uint8_t red = (uint8_t)((r4 << 4) | r4);
    *dst_y++ = RGBToY(red, green, blue);
    src_argb4444 += 2;
  }
}
834 
// Expands the RGB565 pixel stored in src[0..1] to 8 bits per channel.
// Output order is bgr[0] = blue, bgr[1] = green, bgr[2] = red. The 5-bit
// (blue, red) and 6-bit (green) fields are widened by copying their top bits
// into the vacated low bits.
static __inline void RGB565ExpandToBGR(const uint8_t* src, uint8_t bgr[3]) {
  const uint8_t b5 = src[0] & 0x1f;
  const uint8_t g6 = (uint8_t)((src[0] >> 5) | ((src[1] & 0x07) << 3));
  const uint8_t r5 = (uint8_t)(src[1] >> 3);
  bgr[0] = (uint8_t)((b5 << 3) | (b5 >> 2));
  bgr[1] = (uint8_t)((g6 << 2) | (g6 >> 4));
  bgr[2] = (uint8_t)((r5 << 3) | (r5 >> 2));
}

// Produces one U and one V sample for every 2x2 block of RGB565 pixels taken
// from the row at src_rgb565 and the row src_stride_rgb565 bytes after it.
// With LIBYUV_ARGBTOUV_PAVGB the block is reduced with nested AVGB calls and
// converted with RGBToU / RGBToV; otherwise (sum + 1) >> 1 of the four
// samples is converted with RGB2xToU / RGB2xToV. When width is odd, the last
// column is built from its two vertical samples only.
void RGB565ToUVRow_C(const uint8_t* src_rgb565,
                     int src_stride_rgb565,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* row0 = src_rgb565;
  const uint8_t* row1 = src_rgb565 + src_stride_rgb565;
  // Expanded pixels: top-left, top-right, bottom-left, bottom-right.
  uint8_t tl[3], tr[3], bl[3], br[3];
  int x;
  for (x = 0; x < width - 1; x += 2) {
    RGB565ExpandToBGR(row0, tl);
    RGB565ExpandToBGR(row0 + 2, tr);
    RGB565ExpandToBGR(row1, bl);
    RGB565ExpandToBGR(row1 + 2, br);
    {
#if LIBYUV_ARGBTOUV_PAVGB
      // Average vertically first, then horizontally.
      const uint8_t ab = AVGB(AVGB(tl[0], bl[0]), AVGB(tr[0], br[0]));
      const uint8_t ag = AVGB(AVGB(tl[1], bl[1]), AVGB(tr[1], br[1]));
      const uint8_t ar = AVGB(AVGB(tl[2], bl[2]), AVGB(tr[2], br[2]));
      *dst_u++ = RGBToU(ar, ag, ab);
      *dst_v++ = RGBToV(ar, ag, ab);
#else
      const uint16_t b2x = (uint16_t)((tl[0] + tr[0] + bl[0] + br[0] + 1) >> 1);
      const uint16_t g2x = (uint16_t)((tl[1] + tr[1] + bl[1] + br[1] + 1) >> 1);
      const uint16_t r2x = (uint16_t)((tl[2] + tr[2] + bl[2] + br[2] + 1) >> 1);
      *dst_u++ = RGB2xToU(r2x, g2x, b2x);
      *dst_v++ = RGB2xToV(r2x, g2x, b2x);
#endif
    }
    row0 += 4;
    row1 += 4;
  }
  if (width & 1) {
    RGB565ExpandToBGR(row0, tl);
    RGB565ExpandToBGR(row1, bl);
    {
#if LIBYUV_ARGBTOUV_PAVGB
      const uint8_t ab = AVGB(tl[0], bl[0]);
      const uint8_t ag = AVGB(tl[1], bl[1]);
      const uint8_t ar = AVGB(tl[2], bl[2]);
      dst_u[0] = RGBToU(ar, ag, ab);
      dst_v[0] = RGBToV(ar, ag, ab);
#else
      const uint16_t b2x = (uint16_t)(tl[0] + bl[0]);
      const uint16_t g2x = (uint16_t)(tl[1] + bl[1]);
      const uint16_t r2x = (uint16_t)(tl[2] + bl[2]);
      dst_u[0] = RGB2xToU(r2x, g2x, b2x);
      dst_v[0] = RGB2xToV(r2x, g2x, b2x);
#endif
    }
  }
}
918 
// Expands the ARGB1555 pixel stored in src[0..1] to 8 bits per colour
// channel. Output order is bgr[0] = blue, bgr[1] = green, bgr[2] = red; bit 7
// of src[1] is not used. Each 5-bit field is widened by copying its top 3
// bits into the vacated low bits.
static __inline void ARGB1555ExpandToBGR(const uint8_t* src, uint8_t bgr[3]) {
  const uint8_t b5 = src[0] & 0x1f;
  const uint8_t g5 = (uint8_t)((src[0] >> 5) | ((src[1] & 0x03) << 3));
  const uint8_t r5 = (uint8_t)((src[1] & 0x7c) >> 2);
  bgr[0] = (uint8_t)((b5 << 3) | (b5 >> 2));
  bgr[1] = (uint8_t)((g5 << 3) | (g5 >> 2));
  bgr[2] = (uint8_t)((r5 << 3) | (r5 >> 2));
}

// Produces one U and one V sample for every 2x2 block of ARGB1555 pixels
// taken from the row at src_argb1555 and the row src_stride_argb1555 bytes
// after it. With LIBYUV_ARGBTOUV_PAVGB the block is reduced with nested AVGB
// calls and converted with RGBToU / RGBToV; otherwise (sum + 1) >> 1 of the
// four samples is converted with RGB2xToU / RGB2xToV. When width is odd, the
// last column is built from its two vertical samples only.
void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
                       int src_stride_argb1555,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* row0 = src_argb1555;
  const uint8_t* row1 = src_argb1555 + src_stride_argb1555;
  // Expanded pixels: top-left, top-right, bottom-left, bottom-right.
  uint8_t tl[3], tr[3], bl[3], br[3];
  int x;
  for (x = 0; x < width - 1; x += 2) {
    ARGB1555ExpandToBGR(row0, tl);
    ARGB1555ExpandToBGR(row0 + 2, tr);
    ARGB1555ExpandToBGR(row1, bl);
    ARGB1555ExpandToBGR(row1 + 2, br);
    {
#if LIBYUV_ARGBTOUV_PAVGB
      // Average vertically first, then horizontally.
      const uint8_t ab = AVGB(AVGB(tl[0], bl[0]), AVGB(tr[0], br[0]));
      const uint8_t ag = AVGB(AVGB(tl[1], bl[1]), AVGB(tr[1], br[1]));
      const uint8_t ar = AVGB(AVGB(tl[2], bl[2]), AVGB(tr[2], br[2]));
      *dst_u++ = RGBToU(ar, ag, ab);
      *dst_v++ = RGBToV(ar, ag, ab);
#else
      const uint16_t b2x = (uint16_t)((tl[0] + tr[0] + bl[0] + br[0] + 1) >> 1);
      const uint16_t g2x = (uint16_t)((tl[1] + tr[1] + bl[1] + br[1] + 1) >> 1);
      const uint16_t r2x = (uint16_t)((tl[2] + tr[2] + bl[2] + br[2] + 1) >> 1);
      *dst_u++ = RGB2xToU(r2x, g2x, b2x);
      *dst_v++ = RGB2xToV(r2x, g2x, b2x);
#endif
    }
    row0 += 4;
    row1 += 4;
  }
  if (width & 1) {
    ARGB1555ExpandToBGR(row0, tl);
    ARGB1555ExpandToBGR(row1, bl);
    {
#if LIBYUV_ARGBTOUV_PAVGB
      const uint8_t ab = AVGB(tl[0], bl[0]);
      const uint8_t ag = AVGB(tl[1], bl[1]);
      const uint8_t ar = AVGB(tl[2], bl[2]);
      dst_u[0] = RGBToU(ar, ag, ab);
      dst_v[0] = RGBToV(ar, ag, ab);
#else
      const uint16_t b2x = (uint16_t)(tl[0] + bl[0]);
      const uint16_t g2x = (uint16_t)(tl[1] + bl[1]);
      const uint16_t r2x = (uint16_t)(tl[2] + bl[2]);
      dst_u[0] = RGB2xToU(r2x, g2x, b2x);
      dst_v[0] = RGB2xToV(r2x, g2x, b2x);
#endif
    }
  }
}
1002 
// Expands the ARGB4444 pixel stored in src[0..1] to 8 bits per colour
// channel. Output order is bgr[0] = blue, bgr[1] = green, bgr[2] = red; the
// high nibble of src[1] is not used. Each nibble is widened by duplicating it
// into both halves of the byte.
static __inline void ARGB4444ExpandToBGR(const uint8_t* src, uint8_t bgr[3]) {
  const uint8_t b4 = src[0] & 0x0f;
  const uint8_t g4 = (uint8_t)(src[0] >> 4);
  const uint8_t r4 = src[1] & 0x0f;
  bgr[0] = (uint8_t)((b4 << 4) | b4);
  bgr[1] = (uint8_t)((g4 << 4) | g4);
  bgr[2] = (uint8_t)((r4 << 4) | r4);
}

// Produces one U and one V sample for every 2x2 block of ARGB4444 pixels
// taken from the row at src_argb4444 and the row src_stride_argb4444 bytes
// after it. With LIBYUV_ARGBTOUV_PAVGB the block is reduced with nested AVGB
// calls and converted with RGBToU / RGBToV; otherwise (sum + 1) >> 1 of the
// four samples is converted with RGB2xToU / RGB2xToV. When width is odd, the
// last column is built from its two vertical samples only.
void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
                       int src_stride_argb4444,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  const uint8_t* row0 = src_argb4444;
  const uint8_t* row1 = src_argb4444 + src_stride_argb4444;
  // Expanded pixels: top-left, top-right, bottom-left, bottom-right.
  uint8_t tl[3], tr[3], bl[3], br[3];
  int x;
  for (x = 0; x < width - 1; x += 2) {
    ARGB4444ExpandToBGR(row0, tl);
    ARGB4444ExpandToBGR(row0 + 2, tr);
    ARGB4444ExpandToBGR(row1, bl);
    ARGB4444ExpandToBGR(row1 + 2, br);
    {
#if LIBYUV_ARGBTOUV_PAVGB
      // Average vertically first, then horizontally.
      const uint8_t ab = AVGB(AVGB(tl[0], bl[0]), AVGB(tr[0], br[0]));
      const uint8_t ag = AVGB(AVGB(tl[1], bl[1]), AVGB(tr[1], br[1]));
      const uint8_t ar = AVGB(AVGB(tl[2], bl[2]), AVGB(tr[2], br[2]));
      *dst_u++ = RGBToU(ar, ag, ab);
      *dst_v++ = RGBToV(ar, ag, ab);
#else
      const uint16_t b2x = (uint16_t)((tl[0] + tr[0] + bl[0] + br[0] + 1) >> 1);
      const uint16_t g2x = (uint16_t)((tl[1] + tr[1] + bl[1] + br[1] + 1) >> 1);
      const uint16_t r2x = (uint16_t)((tl[2] + tr[2] + bl[2] + br[2] + 1) >> 1);
      *dst_u++ = RGB2xToU(r2x, g2x, b2x);
      *dst_v++ = RGB2xToV(r2x, g2x, b2x);
#endif
    }
    row0 += 4;
    row1 += 4;
  }
  if (width & 1) {
    ARGB4444ExpandToBGR(row0, tl);
    ARGB4444ExpandToBGR(row1, bl);
    {
#if LIBYUV_ARGBTOUV_PAVGB
      const uint8_t ab = AVGB(tl[0], bl[0]);
      const uint8_t ag = AVGB(tl[1], bl[1]);
      const uint8_t ar = AVGB(tl[2], bl[2]);
      dst_u[0] = RGBToU(ar, ag, ab);
      dst_v[0] = RGBToV(ar, ag, ab);
#else
      const uint16_t b2x = (uint16_t)(tl[0] + bl[0]);
      const uint16_t g2x = (uint16_t)(tl[1] + bl[1]);
      const uint16_t r2x = (uint16_t)(tl[2] + bl[2]);
      dst_u[0] = RGB2xToU(r2x, g2x, b2x);
      dst_v[0] = RGB2xToV(r2x, g2x, b2x);
#endif
    }
  }
}
1086 
// Writes one U and one V sample per ARGB pixel (no subsampling).
// Bytes 0, 1 and 2 of each 4-byte pixel are passed to RGBToU / RGBToV as
// blue, green and red; byte 3 is ignored.
void ARGBToUV444Row_C(const uint8_t* src_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int i;
  for (i = 0; i < width; ++i, src_argb += 4) {
    const uint8_t blue = src_argb[0];
    const uint8_t green = src_argb[1];
    const uint8_t red = src_argb[2];
    dst_u[i] = RGBToU(red, green, blue);
    dst_v[i] = RGBToV(red, green, blue);
  }
}
1103 
// Converts a row of ARGB pixels to gray: the RGBToYJ value of each pixel is
// stored in all three colour bytes and byte 3 is copied from the source.
void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (; width > 0; --width) {
    const uint8_t luma = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
    dst_argb[0] = luma;
    dst_argb[1] = luma;
    dst_argb[2] = luma;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}
1114 
// Applies a sepia tone to a row of ARGB pixels in place.
// Each output colour is a fixed weighted sum of the input B, G, R values with
// 7 fractional bits (weights are out of 128). Byte 3 of each pixel is left
// untouched.
void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
  uint8_t* px = dst_argb;
  int i;
  for (i = 0; i < width; ++i, px += 4) {
    const int blue = px[0];
    const int green = px[1];
    const int red = px[2];
    const int sepia_b = (blue * 17 + green * 68 + red * 35) >> 7;
    const int sepia_g = (blue * 22 + green * 88 + red * 45) >> 7;
    const int sepia_r = (blue * 24 + green * 98 + red * 50) >> 7;
    // Blue weights sum to 120 (< 128), so it cannot exceed 255; green and
    // red weights sum to more than 128 and need clamping.
    px[0] = sepia_b;
    px[1] = clamp255(sepia_g);
    px[2] = clamp255(sepia_r);
  }
}
1132 
// Multiplies every ARGB pixel of a row by a signed 4x4 colour matrix.
// matrix_argb holds 16 int8_t coefficients, four per output channel in the
// order B, G, R, A; within each group the coefficients apply to the input
// B, G, R, A values. Sums are shifted right by 6 (so 64 represents 1.0) and
// passed through Clamp before being stored.
// TODO(fbarchard): Consider adding rounding (+32).
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const int8_t* matrix_argb,
                          int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int blue = src_argb[0];
    const int green = src_argb[1];
    const int red = src_argb[2];
    const int alpha = src_argb[3];
    int scaled[4];
    int ch;
    // Evaluate all four output channels before storing anything.
    for (ch = 0; ch < 4; ++ch) {
      const int8_t* coeff = matrix_argb + ch * 4;
      scaled[ch] = (blue * coeff[0] + green * coeff[1] + red * coeff[2] +
                    alpha * coeff[3]) >>
                   6;
    }
    for (ch = 0; ch < 4; ++ch) {
      dst_argb[ch] = Clamp(scaled[ch]);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
1165 
// Remaps all four channels of every ARGB pixel in place through a lookup
// table. table_argb is indexed as value * 4 + channel, i.e. 256 interleaved
// 4-byte entries, one column per channel.
void ARGBColorTableRow_C(uint8_t* dst_argb,
                         const uint8_t* table_argb,
                         int width) {
  for (; width > 0; --width) {
    // Fetch all indices first, then overwrite the pixel.
    const int blue_idx = dst_argb[0] * 4 + 0;
    const int green_idx = dst_argb[1] * 4 + 1;
    const int red_idx = dst_argb[2] * 4 + 2;
    const int alpha_idx = dst_argb[3] * 4 + 3;
    dst_argb[0] = table_argb[blue_idx];
    dst_argb[1] = table_argb[green_idx];
    dst_argb[2] = table_argb[red_idx];
    dst_argb[3] = table_argb[alpha_idx];
    dst_argb += 4;
  }
}
1183 
// Remaps the B, G and R channels of every ARGB pixel in place through a
// lookup table; byte 3 of each pixel is left unchanged. table_argb is indexed
// as value * 4 + channel.
void RGBColorTableRow_C(uint8_t* dst_argb,
                        const uint8_t* table_argb,
                        int width) {
  for (; width > 0; --width) {
    // Fetch all indices first, then overwrite the colour bytes.
    const int blue_idx = dst_argb[0] * 4 + 0;
    const int green_idx = dst_argb[1] * 4 + 1;
    const int red_idx = dst_argb[2] * 4 + 2;
    dst_argb[0] = table_argb[blue_idx];
    dst_argb[1] = table_argb[green_idx];
    dst_argb[2] = table_argb[red_idx];
    dst_argb += 4;
  }
}
1199 
// Quantizes the B, G and R channels of every ARGB pixel in place.
// Each value v becomes ((v * scale) >> 16) * interval_size + interval_offset,
// so scale acts as a 16.16 fixed-point multiplier. Byte 3 of each pixel is
// left unchanged.
void ARGBQuantizeRow_C(uint8_t* dst_argb,
                       int scale,
                       int interval_size,
                       int interval_offset,
                       int width) {
  uint8_t* px = dst_argb;
  int i;
  for (i = 0; i < width; ++i, px += 4) {
    const int blue = px[0];
    const int green = px[1];
    const int red = px[2];
    const int blue_step = (blue * scale) >> 16;
    const int green_step = (green * scale) >> 16;
    const int red_step = (red * scale) >> 16;
    px[0] = blue_step * interval_size + interval_offset;
    px[1] = green_step * interval_size + interval_offset;
    px[2] = red_step * interval_size + interval_offset;
  }
}
1216 
// Scales every channel of a row of ARGB pixels by the matching byte of value
// (bits 0-7 scale byte 0, bits 8-15 byte 1, bits 16-23 byte 2, bits 24-31
// byte 3). Both the pixel byte and the scale byte are widened to 16 bits by
// repeating the byte (x * 0x0101 == x | x << 8 for 8-bit x); the 32-bit
// product is then shifted right by 24.
void ARGBShadeRow_C(const uint8_t* src_argb,
                    uint8_t* dst_argb,
                    int width,
                    uint32_t value) {
  const uint32_t scale_b = (value & 0xff) * 0x0101u;
  const uint32_t scale_g = ((value >> 8) & 0xff) * 0x0101u;
  const uint32_t scale_r = ((value >> 16) & 0xff) * 0x0101u;
  const uint32_t scale_a = (value >> 24) * 0x0101u;

  for (; width > 0; --width) {
    const uint32_t b16 = src_argb[0] * 0x0101u;
    const uint32_t g16 = src_argb[1] * 0x0101u;
    const uint32_t r16 = src_argb[2] * 0x0101u;
    const uint32_t a16 = src_argb[3] * 0x0101u;
    dst_argb[0] = (uint8_t)((scale_b * b16) >> 24);
    dst_argb[1] = (uint8_t)((scale_g * g16) >> 24);
    dst_argb[2] = (uint8_t)((scale_r * r16) >> 24);
    dst_argb[3] = (uint8_t)((scale_a * a16) >> 24);
    src_argb += 4;
    dst_argb += 4;
  }
}
1245 
// Multiplies two rows of ARGB pixels channel by channel.
// Each byte of src_argb is widened to 16 bits by repeating it
// (x * 0x0101 == x | x << 8 for 8-bit x), multiplied by the matching 8-bit
// byte of src_argb1, and the product is shifted right by 16.
void ARGBMultiplyRow_C(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  for (; width > 0; --width) {
    const uint32_t b16 = src_argb[0] * 0x0101u;
    const uint32_t g16 = src_argb[1] * 0x0101u;
    const uint32_t r16 = src_argb[2] * 0x0101u;
    const uint32_t a16 = src_argb[3] * 0x0101u;
    const uint32_t mul_b = src_argb1[0];
    const uint32_t mul_g = src_argb1[1];
    const uint32_t mul_r = src_argb1[2];
    const uint32_t mul_a = src_argb1[3];
    dst_argb[0] = (uint8_t)((mul_b * b16) >> 16);
    dst_argb[1] = (uint8_t)((mul_g * g16) >> 16);
    dst_argb[2] = (uint8_t)((mul_r * r16) >> 16);
    dst_argb[3] = (uint8_t)((mul_a * a16) >> 16);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
1274 
// Adds two rows of ARGB pixels channel by channel; each sum is passed
// through clamp255 before being stored.
void ARGBAddRow_C(const uint8_t* src_argb,
                  const uint8_t* src_argb1,
                  uint8_t* dst_argb,
                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // Form all four sums before storing so the pixel is read in full first.
    const int sum_b = src_argb1[0] + src_argb[0];
    const int sum_g = src_argb1[1] + src_argb[1];
    const int sum_r = src_argb1[2] + src_argb[2];
    const int sum_a = src_argb1[3] + src_argb[3];
    dst_argb[0] = clamp255(sum_b);
    dst_argb[1] = clamp255(sum_g);
    dst_argb[2] = clamp255(sum_r);
    dst_argb[3] = clamp255(sum_a);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
1301 
// Subtracts src_argb1 from src_argb channel by channel; each difference is
// passed through clamp0 before being stored.
void ARGBSubtractRow_C(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // Form all four differences before storing so the pixel is read in full
    // first.
    const int diff_b = src_argb[0] - src_argb1[0];
    const int diff_g = src_argb[1] - src_argb1[1];
    const int diff_r = src_argb[2] - src_argb1[2];
    const int diff_a = src_argb[3] - src_argb1[3];
    dst_argb[0] = clamp0(diff_b);
    dst_argb[1] = clamp0(diff_g);
    dst_argb[2] = clamp0(diff_r);
    dst_argb[3] = clamp0(diff_a);
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
1328 
// Sobel functions which mimic SSSE3.

// Horizontal Sobel response for one output row.
// For each column x the samples at x and x + 2 of three input rows are
// differenced; the middle row's difference is weighted by 2. The absolute
// value of the sum is clamped with clamp255. Note that columns up to
// width + 1 of each input row are read.
void SobelXRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 const uint8_t* src_y2,
                 uint8_t* dst_sobelx,
                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int top = src_y0[x] - src_y0[x + 2];
    const int middle = src_y1[x] - src_y1[x + 2];
    const int bottom = src_y2[x] - src_y2[x + 2];
    const int magnitude = Abs(top + middle * 2 + bottom);
    dst_sobelx[x] = (uint8_t)(clamp255(magnitude));
  }
}
1350 
// Vertical Sobel response for one output row.
// For each column x the samples at x, x + 1 and x + 2 of src_y0 are
// differenced against the same columns of src_y1; the centre column's
// difference is weighted by 2. The absolute value of the sum is clamped with
// clamp255. Note that columns up to width + 1 of each input row are read.
void SobelYRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 uint8_t* dst_sobely,
                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int left = src_y0[x + 0] - src_y1[x + 0];
    const int centre = src_y0[x + 1] - src_y1[x + 1];
    const int right = src_y0[x + 2] - src_y1[x + 2];
    const int magnitude = Abs(left + centre * 2 + right);
    dst_sobely[x] = (uint8_t)(clamp255(magnitude));
  }
}
1370 
// Combines the X and Y Sobel planes into gray ARGB pixels: the clamped sum
// of the two inputs is written to bytes 0..2 and byte 3 is set to 255.
void SobelRow_C(const uint8_t* src_sobelx,
                const uint8_t* src_sobely,
                uint8_t* dst_argb,
                int width) {
  int x;
  for (x = 0; x < width; ++x, dst_argb += 4) {
    const int gx = src_sobelx[x];
    const int gy = src_sobely[x];
    const int edge = clamp255(gx + gy);
    dst_argb[0] = (uint8_t)(edge);
    dst_argb[1] = (uint8_t)(edge);
    dst_argb[2] = (uint8_t)(edge);
    dst_argb[3] = (uint8_t)(255u);
  }
}
1387 
// Combines the X and Y Sobel planes into a single 8-bit plane holding the
// clamped sum of the two inputs.
void SobelToPlaneRow_C(const uint8_t* src_sobelx,
                       const uint8_t* src_sobely,
                       uint8_t* dst_y,
                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int gx = src_sobelx[x];
    const int gy = src_sobely[x];
    const int edge = clamp255(gx + gy);
    dst_y[x] = (uint8_t)(edge);
  }
}
1400 
// Packs the Sobel planes into ARGB pixels: byte 0 receives the Y response,
// byte 2 the X response, byte 1 their clamped sum and byte 3 is set to 255.
void SobelXYRow_C(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  int x;
  for (x = 0; x < width; ++x, dst_argb += 4) {
    const int gx = src_sobelx[x];
    const int gy = src_sobely[x];
    const int both = clamp255(gx + gy);
    dst_argb[0] = (uint8_t)(gy);
    dst_argb[1] = (uint8_t)(both);
    dst_argb[2] = (uint8_t)(gx);
    dst_argb[3] = (uint8_t)(255u);
  }
}
1417 
// Expands a row of 8-bit Y samples to ARGB: each sample is copied unchanged
// into bytes 0..2 of the output pixel and byte 3 is set to 255.
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t luma = src_y[i];
    uint8_t* px = dst_argb + i * 4;
    px[0] = luma;
    px[1] = luma;
    px[2] = luma;
    px[3] = 255u;
  }
}
1429 
// Macros to create SIMD specific yuv to rgb conversion constants.
//
// YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) expands to a brace initializer for
// a struct YuvConstants. Its layout differs between the ARM and non-ARM
// builds below, so the struct definition must match the selected branch.

// clang-format off

#if defined(__aarch64__) || defined(__arm__)
// Bias values include subtract 128 from U and V, bias from Y and rounding.
// For B and R bias is negative. For G bias is positive.
// First member: the four U/V coefficients followed by zero padding.
// Second member: YG, the three precomputed per-channel biases, then YB.
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                             \
  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},                     \
   {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
    0, 0}}
#else
// Five members, each a coefficient pattern repeated across the whole vector:
// {UB, 0} pairs, {UG, VG} pairs, {0, VR} pairs, then YG and YB broadcast.
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                     \
  {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,          \
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},         \
   {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,  \
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,          \
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},         \
   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
#endif

// clang-format on

// MAKEYUVCONSTANTS(name, ...) defines two constant tables:
//   kYuv<name>Constants - coefficients as given.
//   kYvu<name>Constants - the same table with the U and V coefficients
//                         swapped (UB <-> VR, UG <-> VG).
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR)            \
  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR);                   \
  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
1460 
1461 // TODO(fbarchard): Generate SIMD structures from float matrix.
1462 
// BT.601 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.596
//  G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
//  B = (Y - 16) * 1.164 + U * 2.018
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B.
// Values are the reference coefficients scaled by 64. Unless one of the
// "unlimited" macros is defined, UB is capped at 128 instead of the exact 129.
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* min(128, round(2.018 * 64)) */
#endif
#define UG 25  /* round(0.391 * 64) */
#define VG 52  /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// Defines kYuvI601Constants and kYvuI601Constants.
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)

// Release the short names so the next colorspace can redefine them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1491 
// BT.601 full range YUV to RGB reference (aka JPEG)
// *  R = Y               + V * 1.40200
// *  G = Y - U * 0.34414 - V * 0.71414
// *  B = Y + U * 1.77200
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B.
// Values are the reference coefficients scaled by 64.
#define UB 113 /* round(1.77200 * 64) */
#define UG 22  /* round(0.34414 * 64) */
#define VG 46  /* round(0.71414 * 64) */
#define VR 90  /* round(1.40200 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

// Defines kYuvJPEGConstants and kYvuJPEGConstants.
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)

// Release the short names so the next colorspace can redefine them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1516 
// BT.709 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.793
//  G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
//  B = (Y - 16) * 1.164 + U * 2.112
//  KR = 0.2126, KB = 0.0722

// U and V contributions to R,G,B.
// Values are the reference coefficients scaled by 64. Unless one of the
// "unlimited" macros is defined, UB is capped at 128 instead of the exact 135.
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* min(128, round(2.112 * 64)) */
#endif
#define UG 14  /* round(0.213 * 64) */
#define VG 34  /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// Defines kYuvH709Constants and kYvuH709Constants.
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)

// Release the short names so the next colorspace can redefine them.
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
1545 
1546 // BT.709 full range YUV to RGB reference
1547 //  R = Y               + V * 1.5748
1548 //  G = Y - U * 0.18732 - V * 0.46812
1549 //  B = Y + U * 1.8556
1550 //  KR = 0.2126, KB = 0.0722
1551 
1552 // U and V contributions to R,G,B, as reference coefficient * 64.
1553 #define UB 119 /* round(1.8556 * 64) */
1554 #define UG 12  /* round(0.18732 * 64) */
1555 #define VG 30  /* round(0.46812 * 64) */
1556 #define VR 101 /* round(1.5748 * 64) */
1557 
1558 // Y contribution to R,G,B.  Scale and bias.  (same as jpeg)
1559 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
1560 #define YB 32    /* 64 / 2 */
1561 
// Expand the F709 constants from the values above.  MAKEYUVCONSTANTS is
// defined earlier in the file; what it emits is not visible here.
1562 MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
1563 
// Release the names so the next colorspace can redefine them.
1564 #undef YG
1565 #undef YB
1566 #undef UB
1567 #undef UG
1568 #undef VG
1569 #undef VR
1570 
1571 // BT.2020 limited range YUV to RGB reference
1572 //  R = (Y - 16) * 1.164384                + V * 1.67867
1573 //  G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
1574 //  B = (Y - 16) * 1.164384 + U * 2.14177
1575 //  KR = 0.2627; KB = 0.0593
1576 
1577 // U and V contributions to R,G,B, as reference coefficient * 64.
// UB is capped at 128 unless one of the UNLIMITED macros below is defined.
1578 #if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
1579 #define UB 137 /* round(2.142 * 64) */
1580 #else
1581 #define UB 128 /* min(128, round(2.142 * 64)) */
1582 #endif
1583 #define UG 12  /* round(0.187326 * 64) */
1584 #define VG 42  /* round(0.65042 * 64) */
1585 #define VR 107 /* round(1.67867 * 64) */
1586 
1587 // Y contribution to R,G,B.  Scale and bias.
1588 #define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
1589 #define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
1590 
// Expand the 2020 constants from the values above.  MAKEYUVCONSTANTS is
// defined earlier in the file; what it emits is not visible here.
1591 MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
1592 
// Release the names so the next colorspace can redefine them.
1593 #undef YG
1594 #undef YB
1595 #undef UB
1596 #undef UG
1597 #undef VG
1598 #undef VR
1599 
1600 // BT.2020 full range YUV to RGB reference
1601 //  R = Y                + V * 1.474600
1602 //  G = Y - U * 0.164553 - V * 0.571353
1603 //  B = Y + U * 1.881400
1604 //  KR = 0.2627; KB = 0.0593
1605 
// U and V contributions to R,G,B, as reference coefficient * 64.
1606 #define UB 120 /* round(1.881400 * 64) */
1607 #define UG 11  /* round(0.164553 * 64) */
1608 #define VG 37  /* round(0.571353 * 64) */
1609 #define VR 94  /* round(1.474600 * 64) */
1610 
1611 // Y contribution to R,G,B.  Scale and bias.  (same as jpeg)
1612 #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
1613 #define YB 32    /* 64 / 2 */
1614 
// Expand the V2020 constants from the values above.  MAKEYUVCONSTANTS is
// defined earlier in the file; what it emits is not visible here.
1615 MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
1616 
// Last colorspace: release the coefficient names.
1617 #undef YG
1618 #undef YB
1619 #undef UB
1620 #undef UG
1621 #undef VG
1622 #undef VR
1623 
// BB, BG and BR are defined before this excerpt (presumably alongside
// MAKEYUVCONSTANTS - TODO confirm); they are no longer needed either.
1624 #undef BB
1625 #undef BG
1626 #undef BR
1627 
1628 #undef MAKEYUVCONSTANTS
1629 
// Helper macros shared by the YuvPixel* functions below.
// LOAD_YUV_CONSTANTS declares local ints holding the coefficients read from
// `yuvconstants`; CALC_RGB16 then declares y1, b16, g16 and r16 computed from
// `y32`, `u` and `v`.  Both expand to declarations, so they must be used
// inside a function body where `yuvconstants`, `y32`, `u` and `v` are already
// in scope, and each may appear only once per scope.
1630 #if defined(__aarch64__) || defined(__arm__)
// ARM: coefficients come from kUVCoeff[0..3] and kRGBCoeffBias[0..3].
1631 #define LOAD_YUV_CONSTANTS                 \
1632   int ub = yuvconstants->kUVCoeff[0];      \
1633   int vr = yuvconstants->kUVCoeff[1];      \
1634   int ug = yuvconstants->kUVCoeff[2];      \
1635   int vg = yuvconstants->kUVCoeff[3];      \
1636   int yg = yuvconstants->kRGBCoeffBias[0]; \
1637   int bb = yuvconstants->kRGBCoeffBias[1]; \
1638   int bg = yuvconstants->kRGBCoeffBias[2]; \
1639   int br = yuvconstants->kRGBCoeffBias[3]
1640 
// ARM: y1 is the upper 16 bits of y32 * yg.  u and v are used as given and
// the per-channel offsets are applied through bb, bg and br.
1641 #define CALC_RGB16                         \
1642   int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
1643   int b16 = y1 + (u * ub) - bb;            \
1644   int g16 = y1 + bg - (u * ug + v * vg);   \
1645   int r16 = y1 + (v * vr) - br
1646 #else
// Other targets: coefficients come from the kUVToB / kUVToG / kUVToR /
// kYToRgb / kYBiasToRgb members.
1647 #define LOAD_YUV_CONSTANTS           \
1648   int ub = yuvconstants->kUVToB[0];  \
1649   int ug = yuvconstants->kUVToG[0];  \
1650   int vg = yuvconstants->kUVToG[1];  \
1651   int vr = yuvconstants->kUVToR[1];  \
1652   int yg = yuvconstants->kYToRgb[0]; \
1653   int yb = yuvconstants->kYBiasToRgb[0]
1654 
// Other targets: y1 is the upper 16 bits of y32 * yg plus the bias yb.
// u and v are copied into int8_t temporaries and reduced by 0x80 before
// being multiplied by the coefficients.
// NOTE(review): this relies on out-of-range conversions to int8_t wrapping,
// which C leaves implementation-defined.
1655 #define CALC_RGB16                                \
1656   int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
1657   int8_t ui = u;                                  \
1658   int8_t vi = v;                                  \
1659   ui -= 0x80;                                     \
1660   vi -= 0x80;                                     \
1661   int b16 = y1 + (ui * ub);                       \
1662   int g16 = y1 - (ui * ug + vi * vg);             \
1663   int r16 = y1 + (vi * vr)
1664 #endif
1665 
1666 // C reference code that mimics the YUV assembly.
1667 // Reads 8 bit YUV and leaves result as 16 bit.
1668 static __inline void YuvPixel(uint8_t y,
1669                               uint8_t u,
1670                               uint8_t v,
1671                               uint8_t* b,
1672                               uint8_t* g,
1673                               uint8_t* r,
1674                               const struct YuvConstants* yuvconstants) {
1675   LOAD_YUV_CONSTANTS;
1676   uint32_t y32 = y * 0x0101;
1677   CALC_RGB16;
1678   *b = Clamp((int32_t)(b16) >> 6);
1679   *g = Clamp((int32_t)(g16) >> 6);
1680   *r = Clamp((int32_t)(r16) >> 6);
1681 }
1682 
1683 // Reads 8 bit YUV and leaves result as 16 bit.
YuvPixel8_16(uint8_t y,uint8_t u,uint8_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1684 static __inline void YuvPixel8_16(uint8_t y,
1685                                   uint8_t u,
1686                                   uint8_t v,
1687                                   int* b,
1688                                   int* g,
1689                                   int* r,
1690                                   const struct YuvConstants* yuvconstants) {
1691   LOAD_YUV_CONSTANTS;
1692   uint32_t y32 = y * 0x0101;
1693   CALC_RGB16;
1694   *b = b16;
1695   *g = g16;
1696   *r = r16;
1697 }
1698 
1699 // C reference code that mimics the YUV 16 bit assembly.
1700 // Reads 10 bit YUV and leaves result as 16 bit.
YuvPixel10_16(uint16_t y,uint16_t u,uint16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1701 static __inline void YuvPixel10_16(uint16_t y,
1702                                    uint16_t u,
1703                                    uint16_t v,
1704                                    int* b,
1705                                    int* g,
1706                                    int* r,
1707                                    const struct YuvConstants* yuvconstants) {
1708   LOAD_YUV_CONSTANTS;
1709   uint32_t y32 = y << 6;
1710   u = clamp255(u >> 2);
1711   v = clamp255(v >> 2);
1712   CALC_RGB16;
1713   *b = b16;
1714   *g = g16;
1715   *r = r16;
1716 }
1717 
1718 // C reference code that mimics the YUV 16 bit assembly.
1719 // Reads 12 bit YUV and leaves result as 16 bit.
YuvPixel12_16(int16_t y,int16_t u,int16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1720 static __inline void YuvPixel12_16(int16_t y,
1721                                    int16_t u,
1722                                    int16_t v,
1723                                    int* b,
1724                                    int* g,
1725                                    int* r,
1726                                    const struct YuvConstants* yuvconstants) {
1727   LOAD_YUV_CONSTANTS;
1728   uint32_t y32 = y << 4;
1729   u = clamp255(u >> 4);
1730   v = clamp255(v >> 4);
1731   CALC_RGB16;
1732   *b = b16;
1733   *g = g16;
1734   *r = r16;
1735 }
1736 
// C reference code that mimics the YUV 10 bit assembly.
// Converts a single 10 bit Y, U, V triple to 8 bit B, G, R.
//   y, u, v       10 bit input samples.
//   b, g, r       receive the converted channels after Clamp().
//   yuvconstants  forwarded to YuvPixel10_16.
static __inline void YuvPixel10(uint16_t y,
                                uint16_t u,
                                uint16_t v,
                                uint8_t* b,
                                uint8_t* g,
                                uint8_t* r,
                                const struct YuvConstants* yuvconstants) {
  int blue;
  int green;
  int red;
  YuvPixel10_16(y, u, v, &blue, &green, &red, yuvconstants);
  // Drop the 6 low bits of each intermediate value, then clamp.
  *b = Clamp(blue >> 6);
  *g = Clamp(green >> 6);
  *r = Clamp(red >> 6);
}
1754 
// C reference code that mimics the YUV 12 bit assembly.
// Converts a single 12 bit Y, U, V triple to 8 bit B, G, R.
//   y, u, v       12 bit input samples.
//   b, g, r       receive the converted channels after Clamp().
//   yuvconstants  forwarded to YuvPixel12_16.
static __inline void YuvPixel12(uint16_t y,
                                uint16_t u,
                                uint16_t v,
                                uint8_t* b,
                                uint8_t* g,
                                uint8_t* r,
                                const struct YuvConstants* yuvconstants) {
  int blue;
  int green;
  int red;
  YuvPixel12_16(y, u, v, &blue, &green, &red, yuvconstants);
  // Drop the 6 low bits of each intermediate value, then clamp.
  *b = Clamp(blue >> 6);
  *g = Clamp(green >> 6);
  *r = Clamp(red >> 6);
}
1772 
1773 // C reference code that mimics the YUV 16 bit assembly.
1774 // Reads 16 bit YUV and leaves result as 8 bit.
YuvPixel16_8(uint16_t y,uint16_t u,uint16_t v,uint8_t * b,uint8_t * g,uint8_t * r,const struct YuvConstants * yuvconstants)1775 static __inline void YuvPixel16_8(uint16_t y,
1776                                   uint16_t u,
1777                                   uint16_t v,
1778                                   uint8_t* b,
1779                                   uint8_t* g,
1780                                   uint8_t* r,
1781                                   const struct YuvConstants* yuvconstants) {
1782   LOAD_YUV_CONSTANTS;
1783   uint32_t y32 = y;
1784   u = clamp255(u >> 8);
1785   v = clamp255(v >> 8);
1786   CALC_RGB16;
1787   *b = Clamp((int32_t)(b16) >> 6);
1788   *g = Clamp((int32_t)(g16) >> 6);
1789   *r = Clamp((int32_t)(r16) >> 6);
1790 }
1791 
1792 // C reference code that mimics the YUV 16 bit assembly.
1793 // Reads 16 bit YUV and leaves result as 16 bit.
YuvPixel16_16(uint16_t y,uint16_t u,uint16_t v,int * b,int * g,int * r,const struct YuvConstants * yuvconstants)1794 static __inline void YuvPixel16_16(uint16_t y,
1795                                    uint16_t u,
1796                                    uint16_t v,
1797                                    int* b,
1798                                    int* g,
1799                                    int* r,
1800                                    const struct YuvConstants* yuvconstants) {
1801   LOAD_YUV_CONSTANTS;
1802   uint32_t y32 = y;
1803   u = clamp255(u >> 8);
1804   v = clamp255(v >> 8);
1805   CALC_RGB16;
1806   *b = b16;
1807   *g = g16;
1808   *r = r16;
1809 }
1810 
1811 // C reference code that mimics the YUV assembly.
1812 // Reads 8 bit YUV and leaves result as 8 bit.
YPixel(uint8_t y,uint8_t * b,uint8_t * g,uint8_t * r,const struct YuvConstants * yuvconstants)1813 static __inline void YPixel(uint8_t y,
1814                             uint8_t* b,
1815                             uint8_t* g,
1816                             uint8_t* r,
1817                             const struct YuvConstants* yuvconstants) {
1818 #if defined(__aarch64__) || defined(__arm__)
1819   int yg = yuvconstants->kRGBCoeffBias[0];
1820   int ygb = yuvconstants->kRGBCoeffBias[4];
1821 #else
1822   int ygb = yuvconstants->kYBiasToRgb[0];
1823   int yg = yuvconstants->kYToRgb[0];
1824 #endif
1825   uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
1826   *b = Clamp(((int32_t)(y1) + ygb) >> 6);
1827   *g = Clamp(((int32_t)(y1) + ygb) >> 6);
1828   *r = Clamp(((int32_t)(y1) + ygb) >> 6);
1829 }
1830 
// Converts one row of 8 bit planar YUV, with one U and one V sample per
// pixel, to 4 byte pixels: b, g, r from YuvPixel followed by alpha 255.
//   src_y, src_u, src_v  input planes, `width` samples each.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel.
//   width                number of pixels; nothing is written if <= 0.
void I444ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = 255;  // Opaque alpha.
    ++src_y;
    ++src_u;
    ++src_v;
    rgb_buf += 4;  // Next output pixel.
  }
}
1848 
// Converts one row of 8 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 4 byte pixels: b, g, r from YuvPixel followed
// by alpha 255.  Also used for 420.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel.
//   width                number of pixels.
void I422ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = 255;
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
             yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = 255;
  }
}
1875 
// 10 bit YUV to ARGB.
// Converts one row of 10 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 4 byte pixels: b, g, r from YuvPixel10 followed
// by alpha 255.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel10.
//   width                number of pixels.
void I210ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel10(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel10(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel10(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
1902 
// Converts one row of 10 bit planar YUV, with one U and one V sample per
// pixel, to 4 byte pixels: b, g, r from YuvPixel10 followed by alpha 255.
//   src_y, src_u, src_v  input planes, `width` samples each.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel10.
//   width                number of pixels; nothing is written if <= 0.
void I410ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
               yuvconstants);
    rgb_buf[3] = 255;  // Opaque alpha.
    ++src_y;
    ++src_u;
    ++src_v;
    rgb_buf += 4;  // Next output pixel.
  }
}
1920 
// Converts one row of 10 bit planar YUV plus a 10 bit alpha plane to 4 byte
// pixels.  Each pair of Y samples shares one U and one V sample; alpha has
// one sample per pixel and is reduced with clamp255(a >> 2).
//   src_y, src_a         `width` samples each.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output: b, g, r, a per pixel.
//   yuvconstants         forwarded to YuvPixel10.
//   width                number of pixels.
void I210AlphaToARGBRow_C(const uint16_t* src_y,
                          const uint16_t* src_u,
                          const uint16_t* src_v,
                          const uint16_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel10(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = clamp255(src_a[0] >> 2);
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel10(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = clamp255(src_a[1] >> 2);
    src_y += 2;
    ++src_u;
    ++src_v;
    src_a += 2;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel10(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = clamp255(src_a[0] >> 2);
  }
}
1948 
// Converts one row of 10 bit planar YUV plus a 10 bit alpha plane to 4 byte
// pixels.  Every plane has one sample per pixel; alpha is reduced with
// clamp255(a >> 2).
//   src_y, src_u, src_v, src_a  input planes, `width` samples each.
//   rgb_buf                     output: b, g, r, a per pixel.
//   yuvconstants                forwarded to YuvPixel10.
//   width                       number of pixels; nothing written if <= 0.
void I410AlphaToARGBRow_C(const uint16_t* src_y,
                          const uint16_t* src_u,
                          const uint16_t* src_v,
                          const uint16_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
               yuvconstants);
    rgb_buf[3] = clamp255(*src_a >> 2);
    ++src_y;
    ++src_u;
    ++src_v;
    ++src_a;
    rgb_buf += 4;  // Next output pixel.
  }
}
1968 
// 12 bit YUV to ARGB.
// Converts one row of 12 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 4 byte pixels: b, g, r from YuvPixel12 followed
// by alpha 255.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel12.
//   width                number of pixels.
void I212ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel12(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel12(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel12(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
1995 
// Packs one pixel into a 32 bit AR30 word and stores it at rgb_buf.
//   rgb_buf  destination for 4 bytes; no alignment is required because the
//            word is written with memcpy, in native byte order.
//   b, g, r  intermediate channel values as returned by the YuvPixel*_16
//            helpers.  Each is shifted right by 4 and passed through
//            Clamp10() before packing.
// Word layout: b in bits 0..9, g in bits 10..19, r in bits 20..29, and the
// top two bits set (0xc0000000).
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
  uint32_t ar30;
  b = b >> 4;  // Drop 4 of the 6 low bits, leaving a 10 bit value.
  g = g >> 4;
  r = r >> 4;
  b = Clamp10(b);
  g = Clamp10(g);
  r = Clamp10(r);
  ar30 = (uint32_t)b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000;
  // A store through (uint32_t*)rgb_buf would be undefined for a byte pointer
  // that is not 4 byte aligned; memcpy writes the same bytes safely.
  memcpy(rgb_buf, &ar30, sizeof(ar30));
}
2007 
// 10 bit YUV to 10 bit AR30.
// Converts one row of 10 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 4 byte AR30 pixels via YuvPixel10_16 and
// StoreAR30.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel10_16.
//   width                number of pixels.
void I210ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel10_16(src_y[0], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel10_16(src_y[1], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf + 4, blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel10_16(src_y[0], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2034 
// 12 bit YUV to 10 bit AR30.
// Converts one row of 12 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 4 byte AR30 pixels via YuvPixel12_16 and
// StoreAR30.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel12_16.
//   width                number of pixels.
void I212ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel12_16(src_y[0], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel12_16(src_y[1], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf + 4, blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel12_16(src_y[0], *src_u, *src_v, &blue, &green, &red,
                  yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2061 
// Converts one row of 10 bit planar YUV, with one U and one V sample per
// pixel, to 4 byte AR30 pixels via YuvPixel10_16 and StoreAR30.
//   src_y, src_u, src_v  input planes, `width` samples each.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel10_16.
//   width                number of pixels; nothing is written if <= 0.
void I410ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel10_16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    ++src_y;
    ++src_u;
    ++src_v;
    rgb_buf += 4;  // Next output pixel.
  }
}
2081 
// P210 has 10 bits in msb of 16 bit NV12 style layout.
// Converts one row of Y plus interleaved UV to 4 byte pixels: b, g, r from
// YuvPixel16_8 followed by alpha 255.  Each pair of Y samples shares one
// U, V pair (src_uv[0] is passed as u, src_uv[1] as v).
//   src_y         `width` luma samples.
//   src_uv        interleaved chroma, one U, V pair per pixel pair.
//   dst_argb      output, 4 bytes per pixel.
//   yuvconstants  forwarded to YuvPixel16_8.
//   width         number of pixels.
void P210ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;
    // Second pixel of the pair reuses the same U, V pair.
    YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], &dst_argb[4], &dst_argb[5],
                 &dst_argb[6], yuvconstants);
    dst_argb[7] = 255;
    src_y += 2;
    src_uv += 2;    // One U, V pair.
    dst_argb += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;
  }
}
2106 
// Converts one row of Y plus interleaved UV, with one U, V pair per pixel,
// to 4 byte pixels: b, g, r from YuvPixel16_8 followed by alpha 255.
//   src_y         `width` luma samples.
//   src_uv        interleaved chroma; src_uv[0] is passed as u, src_uv[1]
//                 as v.
//   dst_argb      output, 4 bytes per pixel.
//   yuvconstants  forwarded to YuvPixel16_8.
//   width         number of pixels; nothing is written if <= 0.
void P410ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_argb,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel16_8(*src_y, src_uv[0], src_uv[1], &dst_argb[0], &dst_argb[1],
                 &dst_argb[2], yuvconstants);
    dst_argb[3] = 255;  // Opaque alpha.
    ++src_y;
    src_uv += 2;    // Next U, V pair.
    dst_argb += 4;  // Next output pixel.
  }
}
2122 
// Converts one row of Y plus interleaved UV to 4 byte AR30 pixels via
// YuvPixel16_16 and StoreAR30.  Each pair of Y samples shares one U, V pair
// (src_uv[0] is passed as u, src_uv[1] as v).
//   src_y         `width` luma samples.
//   src_uv        interleaved chroma, one U, V pair per pixel pair.
//   dst_ar30      output, 4 bytes per pixel.
//   yuvconstants  forwarded to YuvPixel16_16.
//   width         number of pixels.
void P210ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_ar30,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
    // Second pixel of the pair reuses the same U, V pair.
    YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30 + 4, blue, green, red);
    src_y += 2;
    src_uv += 2;    // One U, V pair.
    dst_ar30 += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
  }
}
2146 
// Converts one row of Y plus interleaved UV, with one U, V pair per pixel,
// to 4 byte AR30 pixels via YuvPixel16_16 and StoreAR30.
//   src_y         `width` luma samples.
//   src_uv        interleaved chroma; src_uv[0] is passed as u, src_uv[1]
//                 as v.
//   dst_ar30      output, 4 bytes per pixel.
//   yuvconstants  forwarded to YuvPixel16_16.
//   width         number of pixels; nothing is written if <= 0.
void P410ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_uv,
                     uint8_t* dst_ar30,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel16_16(*src_y, src_uv[0], src_uv[1], &blue, &green, &red,
                  yuvconstants);
    StoreAR30(dst_ar30, blue, green, red);
    ++src_y;
    src_uv += 2;    // Next U, V pair.
    dst_ar30 += 4;  // Next output pixel.
  }
}
2164 
// 8 bit YUV to 10 bit AR30.
// Shares StoreAR30 with the 10 bit paths: YuvPixel8_16 returns the same
// intermediate format as YuvPixel10_16, so the packing step is identical.
// Each pair of Y samples shares one U and one V sample.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output, 4 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel8_16.
//   width                number of pixels.
void I422ToAR30Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int blue;
  int green;
  int red;
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel8_16(src_y[0], *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel8_16(src_y[1], *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf + 4, blue, green, red);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel8_16(src_y[0], *src_u, *src_v, &blue, &green, &red, yuvconstants);
    StoreAR30(rgb_buf, blue, green, red);
  }
}
2192 
// Converts one row of 8 bit planar YUV plus an 8 bit alpha plane to 4 byte
// pixels.  Every plane has one sample per pixel; alpha is copied unchanged.
//   src_y, src_u, src_v, src_a  input planes, `width` samples each.
//   rgb_buf                     output: b, g, r, a per pixel.
//   yuvconstants                forwarded to YuvPixel.
//   width                       number of pixels; nothing written if <= 0.
void I444AlphaToARGBRow_C(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          const uint8_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = *src_a;
    ++src_y;
    ++src_u;
    ++src_v;
    ++src_a;
    rgb_buf += 4;  // Next output pixel.
  }
}
2212 
// Converts one row of 8 bit planar YUV plus an 8 bit alpha plane to 4 byte
// pixels.  Each pair of Y samples shares one U and one V sample; alpha has
// one sample per pixel and is copied unchanged.
//   src_y, src_a         `width` samples each.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output: b, g, r, a per pixel.
//   yuvconstants         forwarded to YuvPixel.
//   width                number of pixels.
void I422AlphaToARGBRow_C(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          const uint8_t* src_a,
                          uint8_t* rgb_buf,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = src_a[0];
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
             yuvconstants);
    rgb_buf[7] = src_a[1];
    src_y += 2;
    ++src_u;
    ++src_v;
    src_a += 2;
    rgb_buf += 8;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    rgb_buf[3] = src_a[0];
  }
}
2240 
// Converts one row of 8 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 3 byte pixels: b, g, r from YuvPixel with no
// alpha byte.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   rgb_buf              output, 3 bytes per pixel.
//   yuvconstants         forwarded to YuvPixel.
//   width                number of pixels.
void I422ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int remaining;
  // Whole pairs first.
  for (remaining = width; remaining > 1; remaining -= 2) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
    // Second pixel of the pair reuses the same chroma sample.
    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[3], &rgb_buf[4], &rgb_buf[5],
             yuvconstants);
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 6;  // Two output pixels.
  }
  // Odd width: one trailing pixel.
  if (width & 1) {
    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
             yuvconstants);
  }
}
2263 
// Converts one row of 8 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 16 bit ARGB4444 pixels.
// Each channel from YuvPixel keeps its top 4 bits; the word is
// b | g << 4 | r << 8 | 0xf000 and is written in native byte order.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   dst_argb4444         output, 2 bytes per pixel; no alignment is required
//                        because each word is written with memcpy.
//   yuvconstants         forwarded to YuvPixel.
//   width                number of pixels.
void I422ToARGB4444Row_C(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb4444,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    // A store through (uint16_t*)dst would be undefined for a byte pointer
    // that is not 2 byte aligned; memcpy writes the same bytes safely.
    pixel = (uint16_t)((b0 >> 4) | ((g0 >> 4) << 4) | ((r0 >> 4) << 8) |
                       0xf000);
    memcpy(dst_argb4444 + 0, &pixel, sizeof(pixel));
    pixel = (uint16_t)((b1 >> 4) | ((g1 >> 4) << 4) | ((r1 >> 4) << 8) |
                       0xf000);
    memcpy(dst_argb4444 + 2, &pixel, sizeof(pixel));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_argb4444 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    pixel = (uint16_t)((b0 >> 4) | ((g0 >> 4) << 4) | ((r0 >> 4) << 8) |
                       0xf000);
    memcpy(dst_argb4444, &pixel, sizeof(pixel));
  }
}
2301 
// Converts one row of 8 bit planar YUV, where each pair of Y samples shares
// one U and one V sample, to 16 bit ARGB1555 pixels.
// Each channel from YuvPixel keeps its top 5 bits; the word is
// b | g << 5 | r << 10 | 0x8000 and is written in native byte order.
//   src_y                `width` luma samples.
//   src_u, src_v         one chroma sample per pixel pair.
//   dst_argb1555         output, 2 bytes per pixel; no alignment is required
//                        because each word is written with memcpy.
//   yuvconstants         forwarded to YuvPixel.
//   width                number of pixels.
void I422ToARGB1555Row_C(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_argb1555,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    // A store through (uint16_t*)dst would be undefined for a byte pointer
    // that is not 2 byte aligned; memcpy writes the same bytes safely.
    pixel = (uint16_t)((b0 >> 3) | ((g0 >> 3) << 5) | ((r0 >> 3) << 10) |
                       0x8000);
    memcpy(dst_argb1555 + 0, &pixel, sizeof(pixel));
    pixel = (uint16_t)((b1 >> 3) | ((g1 >> 3) << 5) | ((r1 >> 3) << 10) |
                       0x8000);
    memcpy(dst_argb1555 + 2, &pixel, sizeof(pixel));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_argb1555 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    // Odd width: one trailing pixel.
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    pixel = (uint16_t)((b0 >> 3) | ((g0 >> 3) << 5) | ((r0 >> 3) << 10) |
                       0x8000);
    memcpy(dst_argb1555, &pixel, sizeof(pixel));
  }
}
2339 
// Convert a row of I422 (each U and V sample is shared by a horizontal pair
// of Y samples) to RGB565, 2 bytes per pixel.
// The bytes written by YuvPixel are reduced to 5 (b), 6 (g) and 5 (r) bits
// and packed as b | (g << 5) | (r << 11).
// Each 16 bit pixel is stored in native byte order with memcpy, so
// dst_rgb565 does not need to be 2 byte aligned.
void I422ToRGB565Row_C(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_rgb565,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel0;
  uint16_t pixel1;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
    pixel0 = (uint16_t)((b0 >> 3) | ((g0 >> 2) << 5) | ((r0 >> 3) << 11));
    pixel1 = (uint16_t)((b1 >> 3) | ((g1 >> 2) << 5) | ((r1 >> 3) << 11));
    // memcpy instead of a uint16_t* cast: no alignment requirement on dst.
    memcpy(dst_rgb565 + 0, &pixel0, sizeof(pixel0));
    memcpy(dst_rgb565 + 2, &pixel1, sizeof(pixel1));
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    // Odd width: convert the last pixel on its own.
    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
    pixel0 = (uint16_t)((b0 >> 3) | ((g0 >> 2) << 5) | ((r0 >> 3) << 11));
    memcpy(dst_rgb565, &pixel0, sizeof(pixel0));
  }
}
2377 
// Convert a row of NV12 (Y plane plus interleaved U,V pairs, one pair per
// two pixels) to 4 byte pixels. YuvPixel fills bytes 0..2 of each pixel and
// byte 3 is set to 255.
void NV12ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_uv,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    // Both pixels of the pair use the same U (src_uv[0]) and V (src_uv[1]).
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], src_uv[0], src_uv[1], &rgb_buf[4], &rgb_buf[5],
             &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    src_uv += 2;
    rgb_buf += 8;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2401 
// Convert a row of NV21 (Y plane plus interleaved V,U pairs, one pair per
// two pixels) to 4 byte pixels. YuvPixel fills bytes 0..2 of each pixel and
// byte 3 is set to 255.
void NV21ToARGBRow_C(const uint8_t* src_y,
                     const uint8_t* src_vu,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    // V comes first in memory, so U is src_vu[1] and V is src_vu[0].
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_y[1], src_vu[1], src_vu[0], &rgb_buf[4], &rgb_buf[5],
             &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    src_vu += 2;
    rgb_buf += 8;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row.
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2425 
// Convert a row of NV12 (Y plane plus interleaved U,V pairs, one pair per
// two pixels) to 3 byte pixels. YuvPixel fills all 3 bytes of each pixel.
void NV12ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_uv,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    // Both pixels of the pair use the same U (src_uv[0]) and V (src_uv[1]).
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    YuvPixel(src_y[1], src_uv[0], src_uv[1], &rgb_buf[3], &rgb_buf[4],
             &rgb_buf[5], yuvconstants);
    src_y += 2;
    src_uv += 2;
    rgb_buf += 6;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
  }
}
2446 
// Convert a row of NV21 (Y plane plus interleaved V,U pairs, one pair per
// two pixels) to 3 byte pixels. YuvPixel fills all 3 bytes of each pixel.
void NV21ToRGB24Row_C(const uint8_t* src_y,
                      const uint8_t* src_vu,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    // V comes first in memory, so U is src_vu[1] and V is src_vu[0].
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    YuvPixel(src_y[1], src_vu[1], src_vu[0], &rgb_buf[3], &rgb_buf[4],
             &rgb_buf[5], yuvconstants);
    src_y += 2;
    src_vu += 2;
    rgb_buf += 6;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row.
    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
  }
}
2467 
// Convert a row of NV12 (Y plane plus interleaved U,V pairs, one pair per
// two pixels) to RGB565, 2 bytes per pixel.
// The bytes written by YuvPixel are reduced to 5 (b), 6 (g) and 5 (r) bits
// and packed as b | (g << 5) | (r << 11).
// Each 16 bit pixel is stored in native byte order with memcpy, so
// dst_rgb565 does not need to be 2 byte aligned.
void NV12ToRGB565Row_C(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* dst_rgb565,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  uint8_t b0;
  uint8_t g0;
  uint8_t r0;
  uint8_t b1;
  uint8_t g1;
  uint8_t r1;
  uint16_t pixel0;
  uint16_t pixel1;
  int x;
  for (x = 0; x < width - 1; x += 2) {
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
    pixel0 = (uint16_t)((b0 >> 3) | ((g0 >> 2) << 5) | ((r0 >> 3) << 11));
    pixel1 = (uint16_t)((b1 >> 3) | ((g1 >> 2) << 5) | ((r1 >> 3) << 11));
    // memcpy instead of a uint16_t* cast: no alignment requirement on dst.
    memcpy(dst_rgb565 + 0, &pixel0, sizeof(pixel0));
    memcpy(dst_rgb565 + 2, &pixel1, sizeof(pixel1));
    src_y += 2;
    src_uv += 2;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
    // Odd width: convert the last pixel on its own.
    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
    pixel0 = (uint16_t)((b0 >> 3) | ((g0 >> 2) << 5) | ((r0 >> 3) << 11));
    memcpy(dst_rgb565, &pixel0, sizeof(pixel0));
  }
}
2503 
// Convert a row of packed YUY2 (bytes Y0 U Y1 V per pixel pair) to 4 byte
// pixels. YuvPixel fills bytes 0..2 of each pixel and byte 3 is set to 255.
void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    // Y0 at [0], Y1 at [2]; U at [1] and V at [3] are shared by the pair.
    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], &rgb_buf[4], &rgb_buf[5],
             &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_yuy2 += 4;
    rgb_buf += 8;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row (still reads U and V of its pair).
    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2525 
// Convert a row of packed UYVY (bytes U Y0 V Y1 per pixel pair) to 4 byte
// pixels. YuvPixel fills bytes 0..2 of each pixel and byte 3 is set to 255.
void UYVYToARGBRow_C(const uint8_t* src_uyvy,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    // Y0 at [1], Y1 at [3]; U at [0] and V at [2] are shared by the pair.
    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], &rgb_buf[4], &rgb_buf[5],
             &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_uyvy += 4;
    rgb_buf += 8;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row (still reads U and V of its pair).
    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], &rgb_buf[0], &rgb_buf[1],
             &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2547 
// Convert a row of I422 (each U and V sample is shared by a horizontal pair
// of Y samples) to 4 byte pixels where byte 0 is set to 255 and YuvPixel
// fills bytes 1..3.
void I422ToRGBARow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[1], &rgb_buf[2],
             &rgb_buf[3], yuvconstants);
    rgb_buf[0] = 255;
    YuvPixel(src_y[1], src_u[0], src_v[0], &rgb_buf[5], &rgb_buf[6],
             &rgb_buf[7], yuvconstants);
    rgb_buf[4] = 255;
    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row.
    YuvPixel(src_y[0], src_u[0], src_v[0], &rgb_buf[1], &rgb_buf[2],
             &rgb_buf[3], yuvconstants);
    rgb_buf[0] = 255;
  }
}
2573 
// Convert a row of Y only samples to 4 byte pixels. YPixel fills bytes 0..2
// of each pixel from one Y sample and byte 3 is set to 255.
void I400ToARGBRow_C(const uint8_t* src_y,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int pairs = width / 2;
  while (pairs-- > 0) {
    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YPixel(src_y[1], &rgb_buf[4], &rgb_buf[5], &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;
    src_y += 2;
    rgb_buf += 8;
  }
  if (width & 1) {
    // Trailing pixel of an odd width row.
    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
2592 
// Copy width bytes from src to dst in reverse order: dst[i] = src[width-1-i].
void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  const uint8_t* in = src + width - 1;  // Last source byte.
  int pairs = width / 2;
  int x = 0;
  while (pairs-- > 0) {
    dst[x] = in[0];
    dst[x + 1] = in[-1];
    in -= 2;
    x += 2;
  }
  if (width & 1) {
    // Odd width: "in" now points at the first source byte.
    dst[width - 1] = in[0];
  }
}
2605 
// Reverse the order of width 2 byte (U,V) pairs from src_uv into dst_uv.
// The two bytes inside each pair keep their order.
// Does nothing when width <= 0; this avoids the left shift of a negative
// value and the out of range source pointer the unguarded form would compute.
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  if (width <= 0) {
    return;
  }
  for (x = 0; x < width; ++x) {
    // Source pair for output pair x, counted from the end of the row.
    const uint8_t* pair = src_uv + (size_t)(width - 1 - x) * 2;
    dst_uv[0] = pair[0];
    dst_uv[1] = pair[1];
    dst_uv += 2;
  }
}
2616 
// Reverse the order of width interleaved (U,V) pairs from src_uv and split
// them: dst_u[x] receives byte 0 and dst_v[x] byte 1 of pair width-1-x.
// Does nothing when width <= 0; this avoids the left shift of a negative
// value and the out of range source pointer the unguarded form would compute.
void MirrorSplitUVRow_C(const uint8_t* src_uv,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  int x;
  if (width <= 0) {
    return;
  }
  for (x = 0; x < width; ++x) {
    // Source pair for output sample x, counted from the end of the row.
    const uint8_t* pair = src_uv + (size_t)(width - 1 - x) * 2;
    dst_u[x] = pair[0];
    dst_v[x] = pair[1];
  }
}
2635 
// Reverse the order of width 4 byte pixels from src into dst. The 4 bytes
// inside each pixel keep their order.
// Pixels are moved with memcpy through a local uint32_t, so neither src nor
// dst has to be 4 byte aligned. Does nothing when width <= 0.
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  if (width <= 0) {
    return;
  }
  for (x = 0; x < width; ++x) {
    uint32_t pixel;
    // Source pixel for output pixel x, counted from the end of the row.
    memcpy(&pixel, src + (size_t)(width - 1 - x) * 4, sizeof(pixel));
    memcpy(dst + (size_t)x * 4, &pixel, sizeof(pixel));
  }
}
2650 
// Reverse the order of width 3 byte pixels from src_rgb24 into dst_rgb24.
// The 3 bytes inside each pixel keep their order.
void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
  const uint8_t* in = src_rgb24 + width * 3 - 3;  // Last source pixel.
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Read the whole pixel before writing it.
    const uint8_t c0 = in[0];
    const uint8_t c1 = in[1];
    const uint8_t c2 = in[2];
    dst_rgb24[0] = c0;
    dst_rgb24[1] = c1;
    dst_rgb24[2] = c2;
    in -= 3;
    dst_rgb24 += 3;
  }
}
2665 
// Gather one row of width bytes from tiled storage into dst.
// 16 bytes are copied from each tile; src_tile_stride is the byte distance
// from one tile to the next. A final partial run of (width & 15) bytes is
// copied from the following tile.
void DetileRow_C(const uint8_t* src,
                 ptrdiff_t src_tile_stride,
                 uint8_t* dst,
                 int width) {
  int remaining = width;
  while (remaining >= 16) {
    memcpy(dst, src, 16);
    src += src_tile_stride;
    dst += 16;
    remaining -= 16;
  }
  if (width & 15) {
    memcpy(dst, src, width & 15);
  }
}
2680 
// Gather one row of interleaved U,V bytes from tiled storage and split it
// into dst_u and dst_v. width counts source bytes: each full tile supplies
// 16 bytes (8 U,V pairs) and src_tile_stride is the byte distance between
// tiles. The remaining (width & 15) / 2 pairs come from the following tile.
void DetileSplitUVRow_C(const uint8_t* src_uv,
                        ptrdiff_t src_tile_stride,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  const int full_tiles = width / 16;
  const int tail_pairs = (width & 0xF) / 2;
  int tile;
  int i;
  for (tile = 0; tile < full_tiles; ++tile) {
    for (i = 0; i < 8; ++i) {
      dst_u[i] = src_uv[2 * i];
      dst_v[i] = src_uv[2 * i + 1];
    }
    dst_u += 8;
    dst_v += 8;
    src_uv += src_tile_stride;
  }
  for (i = 0; i < tail_pairs; ++i) {
    dst_u[i] = src_uv[2 * i];
    dst_v[i] = src_uv[2 * i + 1];
  }
}
2704 
// Split width interleaved (U,V) byte pairs from src_uv into separate
// dst_u and dst_v rows.
void SplitUVRow_C(const uint8_t* src_uv,
                  uint8_t* dst_u,
                  uint8_t* dst_v,
                  int width) {
  uint8_t* out_u = dst_u;
  uint8_t* out_v = dst_v;
  int pairs = width / 2;
  while (pairs-- > 0) {
    // Two U,V pairs (4 source bytes) per iteration.
    out_u[0] = src_uv[0];
    out_u[1] = src_uv[2];
    out_v[0] = src_uv[1];
    out_v[1] = src_uv[3];
    out_u += 2;
    out_v += 2;
    src_uv += 4;
  }
  if (width & 1) {
    // Odd width: one pair is left.
    dst_u[width - 1] = src_uv[0];
    dst_v[width - 1] = src_uv[1];
  }
}
2722 
// Interleave width samples from src_u and src_v into dst_uv as (U,V) byte
// pairs.
void MergeUVRow_C(const uint8_t* src_u,
                  const uint8_t* src_v,
                  uint8_t* dst_uv,
                  int width) {
  const uint8_t* in_u = src_u;
  const uint8_t* in_v = src_v;
  int pairs = width / 2;
  while (pairs-- > 0) {
    // Two output pairs (4 bytes) per iteration.
    dst_uv[0] = in_u[0];
    dst_uv[1] = in_v[0];
    dst_uv[2] = in_u[1];
    dst_uv[3] = in_v[1];
    in_u += 2;
    in_v += 2;
    dst_uv += 4;
  }
  if (width & 1) {
    // Odd width: one sample of each plane is left.
    dst_uv[0] = src_u[width - 1];
    dst_uv[1] = src_v[width - 1];
  }
}
2740 
// Split width 3 byte pixels into three planes: byte 0 of each pixel goes to
// dst_r, byte 1 to dst_g and byte 2 to dst_b.
void SplitRGBRow_C(const uint8_t* src_rgb,
                   uint8_t* dst_r,
                   uint8_t* dst_g,
                   uint8_t* dst_b,
                   int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    *dst_r = src_rgb[0];
    *dst_g = src_rgb[1];
    *dst_b = src_rgb[2];
    ++dst_r;
    ++dst_g;
    ++dst_b;
    src_rgb += 3;
  }
}
2754 
// Merge three planes into width 3 byte pixels: src_r supplies byte 0,
// src_g byte 1 and src_b byte 2 of each pixel.
void MergeRGBRow_C(const uint8_t* src_r,
                   const uint8_t* src_g,
                   const uint8_t* src_b,
                   uint8_t* dst_rgb,
                   int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_rgb[0] = *src_r;
    dst_rgb[1] = *src_g;
    dst_rgb[2] = *src_b;
    ++src_r;
    ++src_g;
    ++src_b;
    dst_rgb += 3;
  }
}
2768 
// Split width 4 byte pixels into four planes: byte 0 of each pixel goes to
// dst_b, byte 1 to dst_g, byte 2 to dst_r and byte 3 to dst_a.
void SplitARGBRow_C(const uint8_t* src_argb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
                    uint8_t* dst_b,
                    uint8_t* dst_a,
                    int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    *dst_b = src_argb[0];
    *dst_g = src_argb[1];
    *dst_r = src_argb[2];
    *dst_a = src_argb[3];
    ++dst_b;
    ++dst_g;
    ++dst_r;
    ++dst_a;
    src_argb += 4;
  }
}
2784 
// Merge four planes into width 4 byte pixels: src_b supplies byte 0,
// src_g byte 1, src_r byte 2 and src_a byte 3 of each pixel.
void MergeARGBRow_C(const uint8_t* src_r,
                    const uint8_t* src_g,
                    const uint8_t* src_b,
                    const uint8_t* src_a,
                    uint8_t* dst_argb,
                    int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_argb[0] = *src_b;
    dst_argb[1] = *src_g;
    dst_argb[2] = *src_r;
    dst_argb[3] = *src_a;
    ++src_b;
    ++src_g;
    ++src_r;
    ++src_a;
    dst_argb += 4;
  }
}
2800 
// Merge three 16 bit planes into width 32 bit pixels laid out as
// b | (g << 10) | (r << 20) with the top two bits set (0xc0000000).
// depth is the number of significant bits per source sample (10..16); each
// sample is shifted right by (depth - 10) and passed through clamp1023.
// Pixels are stored in native byte order with memcpy, so dst_ar30 does not
// need to be 4 byte aligned.
void MergeXR30Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint8_t* dst_ar30,
                    int depth,
                    int width) {
  assert(depth >= 10);
  assert(depth <= 16);
  int x;
  int shift = depth - 10;
  for (x = 0; x < width; ++x) {
    uint32_t r = clamp1023(src_r[x] >> shift);
    uint32_t g = clamp1023(src_g[x] >> shift);
    uint32_t b = clamp1023(src_b[x] >> shift);
    uint32_t ar30 = b | (g << 10) | (r << 20) | 0xc0000000;
    // memcpy instead of a uint32_t* cast: no alignment requirement on dst.
    memcpy(dst_ar30, &ar30, sizeof(ar30));
    dst_ar30 += sizeof(ar30);
  }
}
2819 
// Merge four 16 bit planes into width pixels of four 16 bit channels in
// the order b, g, r, a. depth is the number of significant bits per source
// sample (1..16): each sample is limited with ClampMax to (1 << depth) - 1
// and shifted left by (16 - depth).
void MergeAR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    const uint16_t* src_a,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  const int msb_shift = 16 - depth;
  const int max_value = (1 << depth) - 1;
  uint16_t* out = dst_ar64;
  int x = 0;
  while (x < width) {
    out[0] = ClampMax(src_b[x], max_value) << msb_shift;
    out[1] = ClampMax(src_g[x], max_value) << msb_shift;
    out[2] = ClampMax(src_r[x], max_value) << msb_shift;
    out[3] = ClampMax(src_a[x], max_value) << msb_shift;
    out += 4;
    ++x;
  }
}
2840 
// Merge four 16 bit planes into width 4 byte pixels in the order b, g, r, a.
// depth is the number of significant bits per source sample (8..16): each
// sample is shifted right by (depth - 8) and passed through clamp255.
void MergeARGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         const uint16_t* src_a,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  const int down_shift = depth - 8;
  uint8_t* out = dst_argb;
  int x = 0;
  while (x < width) {
    out[0] = clamp255(src_b[x] >> down_shift);
    out[1] = clamp255(src_g[x] >> down_shift);
    out[2] = clamp255(src_r[x] >> down_shift);
    out[3] = clamp255(src_a[x] >> down_shift);
    out += 4;
    ++x;
  }
}
2860 
// Merge three 16 bit planes into width pixels of four 16 bit channels in
// the order b, g, r, x, where the fourth channel is always 0xffff.
// depth is the number of significant bits per source sample (1..16): each
// sample is limited with ClampMax to (1 << depth) - 1 and shifted left by
// (16 - depth).
void MergeXR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  const int msb_shift = 16 - depth;
  const int max_value = (1 << depth) - 1;
  uint16_t* out = dst_ar64;
  int x = 0;
  while (x < width) {
    out[0] = ClampMax(src_b[x], max_value) << msb_shift;
    out[1] = ClampMax(src_g[x], max_value) << msb_shift;
    out[2] = ClampMax(src_r[x], max_value) << msb_shift;
    out[3] = 0xffff;
    out += 4;
    ++x;
  }
}
2880 
// Merge three 16 bit planes into width 4 byte pixels in the order b, g, r,
// x, where the fourth byte is always 0xff.
// depth is the number of significant bits per source sample (8..16): each
// sample is shifted right by (depth - 8) and passed through clamp255.
void MergeXRGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  const int down_shift = depth - 8;
  uint8_t* out = dst_argb;
  int x = 0;
  while (x < width) {
    out[0] = clamp255(src_b[x] >> down_shift);
    out[1] = clamp255(src_g[x] >> down_shift);
    out[2] = clamp255(src_r[x] >> down_shift);
    out[3] = 0xff;
    out += 4;
    ++x;
  }
}
2899 
// Split width 4 byte pixels into three planes: byte 0 of each pixel goes to
// dst_b, byte 1 to dst_g and byte 2 to dst_r. Byte 3 is ignored.
void SplitXRGBRow_C(const uint8_t* src_argb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
                    uint8_t* dst_b,
                    int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    *dst_b = src_argb[0];
    *dst_g = src_argb[1];
    *dst_r = src_argb[2];
    ++dst_b;
    ++dst_g;
    ++dst_r;
    src_argb += 4;
  }
}
2913 
// Merge three planes into width 4 byte pixels: src_b supplies byte 0,
// src_g byte 1, src_r byte 2 and byte 3 is always 255.
void MergeXRGBRow_C(const uint8_t* src_r,
                    const uint8_t* src_g,
                    const uint8_t* src_b,
                    uint8_t* dst_argb,
                    int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_argb[0] = *src_b;
    dst_argb[1] = *src_g;
    dst_argb[2] = *src_r;
    dst_argb[3] = 255;
    ++src_b;
    ++src_g;
    ++src_r;
    dst_argb += 4;
  }
}
2928 
// Interleave width 16 bit samples from src_u and src_v into dst_uv as (U,V)
// pairs, converting lsb aligned samples to msb aligned: every sample is
// shifted left by (16 - depth), where depth (8..16) is the sample bit depth.
void MergeUVRow_16_C(const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint16_t* dst_uv,
                     int depth,
                     int width) {
  const int msb_shift = 16 - depth;
  assert(depth >= 8);
  assert(depth <= 16);
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    dst_uv[0] = *src_u << msb_shift;
    dst_uv[1] = *src_v << msb_shift;
    ++src_u;
    ++src_v;
    dst_uv += 2;
  }
}
2945 
// Split width interleaved 16 bit (U,V) pairs from src_uv into dst_u and
// dst_v, converting msb aligned samples to lsb aligned: every sample is
// shifted right by (16 - depth), where depth (8..16) is the sample bit depth.
void SplitUVRow_16_C(const uint16_t* src_uv,
                     uint16_t* dst_u,
                     uint16_t* dst_v,
                     int depth,
                     int width) {
  const int lsb_shift = 16 - depth;
  int remaining;
  assert(depth >= 8);
  assert(depth <= 16);
  for (remaining = width; remaining > 0; --remaining) {
    *dst_u = src_uv[0] >> lsb_shift;
    *dst_v = src_uv[1] >> lsb_shift;
    ++dst_u;
    ++dst_v;
    src_uv += 2;
  }
}
2962 
// Multiply each of width 16 bit samples by scale and store the low 16 bits
// of the product in dst_y.
// The product is formed in unsigned 32 bit arithmetic: a uint16_t operand
// would otherwise be promoted to int, and a large scale could overflow it.
void MultiplyRow_16_C(const uint16_t* src_y,
                      uint16_t* dst_y,
                      int scale,
                      int width) {
  const uint32_t multiplier = (uint32_t)scale;
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)((uint32_t)src_y[x] * multiplier);
  }
}
2972 
// Scale each of width 16 bit samples by scale / 65536:
// dst_y[x] = (src_y[x] * scale) >> 16.
// The product is formed in unsigned 32 bit arithmetic: a uint16_t operand
// would otherwise be promoted to int, and a scale above 32768 could
// overflow it.
void DivideRow_16_C(const uint16_t* src_y,
                    uint16_t* dst_y,
                    int scale,
                    int width) {
  const uint32_t multiplier = (uint32_t)scale;
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * multiplier) >> 16);
  }
}
2982 
// Convert 16 bit samples to 8 bit: dst = clamp255((src * scale) >> 16).
// scale selects the source bit depth:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)

// Convert a row of width 16 bit samples to 8 bit using C16TO8.
// scale must be in the range 256..32768.
void Convert16To8Row_C(const uint16_t* src_y,
                       uint8_t* dst_y,
                       int scale,
                       int width) {
  int remaining;
  assert(scale >= 256);
  assert(scale <= 32768);

  for (remaining = width; remaining > 0; --remaining) {
    *dst_y = C16TO8(*src_y, scale);
    ++dst_y;
    ++src_y;
  }
}
3003 
// Convert a row of width 8 bit samples to 16 bit samples with fewer
// significant bits. scale selects the output bit depth:
// 1024 = 10 bits
// The source byte is replicated into both bytes of a 16 bit value (the
// 0x0101 factor), multiplied by scale and shifted right by 16.
// All arithmetic is unsigned 32 bit so that large scales (for example 65536)
// cannot overflow a signed int.
void Convert8To16Row_C(const uint8_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width) {
  const uint32_t replicated_scale = (uint32_t)scale * 0x0101u;
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * replicated_scale) >> 16);
  }
}
3016 
// Copy count bytes from src to dst. The rows must not overlap (memcpy).
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
  const size_t bytes = (size_t)count;
  memcpy(dst, src, bytes);
}
3020 
// Copy count 16 bit samples from src to dst. The rows must not overlap
// (memcpy). The byte count is computed in size_t so that a large count
// cannot overflow int arithmetic.
void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
  memcpy(dst, src, (size_t)count * sizeof(*dst));
}
3024 
// Fill width bytes of dst with the value v8.
void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
  const size_t bytes = (size_t)width;
  memset(dst, v8, bytes);
}
3028 
// Fill width 4 byte pixels of dst_argb with v32. Each pixel receives the
// native byte representation of v32 through memcpy, so dst_argb does not
// need to be 4 byte aligned.
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
  uint8_t* out = dst_argb;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    memcpy(out, &v32, sizeof(v32));
    out += sizeof(v32);
  }
}
3035 
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
// For every pixel pair the U (byte 1) and V (byte 3) samples of the current
// row and of the row src_stride_yuy2 bytes further on are averaged with
// rounding. One U and one V are written per 2 pixels of width.
void YUY2ToUVRow_C(const uint8_t* src_yuy2,
                   int src_stride_yuy2,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width) {
  const uint8_t* row0 = src_yuy2;
  const uint8_t* row1 = src_yuy2 + src_stride_yuy2;
  int x = 0;
  while (x < width) {
    *dst_u = (row0[1] + row1[1] + 1) >> 1;
    *dst_v = (row0[3] + row1[3] + 1) >> 1;
    ++dst_u;
    ++dst_v;
    row0 += 4;
    row1 += 4;
    x += 2;
  }
}
3052 
// Copy row of YUY2 UV's (422) into U and V (422).
// For every pixel pair, byte 1 goes to dst_u and byte 3 to dst_v. One U and
// one V are written per 2 pixels of width.
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x = 0;
  while (x < width) {
    *dst_u = src_yuy2[1];
    *dst_v = src_yuy2[3];
    ++dst_u;
    ++dst_v;
    src_yuy2 += 4;
    x += 2;
  }
}
3068 
// Copy row of YUY2 Y's (422) into Y (420/422).
// Y samples sit at bytes 0 and 2 of every 4 byte pixel pair.
void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  uint8_t* out = dst_y;
  int pairs = width / 2;
  while (pairs-- > 0) {
    out[0] = src_yuy2[0];
    out[1] = src_yuy2[2];
    out += 2;
    src_yuy2 += 4;
  }
  if (width & 1) {
    // Odd width: first Y of the last pair.
    dst_y[width - 1] = src_yuy2[0];
  }
}
3082 
// Filter 2 rows of UYVY UV's (422) into U and V (420).
// For every pixel pair the U (byte 0) and V (byte 2) samples of the current
// row and of the row src_stride_uyvy bytes further on are averaged with
// rounding. One U and one V are written per 2 pixels of width.
void UYVYToUVRow_C(const uint8_t* src_uyvy,
                   int src_stride_uyvy,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width) {
  const uint8_t* row0 = src_uyvy;
  const uint8_t* row1 = src_uyvy + src_stride_uyvy;
  int x = 0;
  while (x < width) {
    *dst_u = (row0[0] + row1[0] + 1) >> 1;
    *dst_v = (row0[2] + row1[2] + 1) >> 1;
    ++dst_u;
    ++dst_v;
    row0 += 4;
    row1 += 4;
    x += 2;
  }
}
3099 
// Copy row of UYVY UV's (422) into U and V (422).
// For every pixel pair, byte 0 goes to dst_u and byte 2 to dst_v. One U and
// one V are written per 2 pixels of width.
void UYVYToUV422Row_C(const uint8_t* src_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x = 0;
  while (x < width) {
    *dst_u = src_uyvy[0];
    *dst_v = src_uyvy[2];
    ++dst_u;
    ++dst_v;
    src_uyvy += 4;
    x += 2;
  }
}
3115 
// Copy row of UYVY Y's (422) into Y (420/422).
// Y samples sit at bytes 1 and 3 of every 4 byte pixel pair.
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  uint8_t* out = dst_y;
  int pairs = width / 2;
  while (pairs-- > 0) {
    out[0] = src_uyvy[1];
    out[1] = src_uyvy[3];
    out += 2;
    src_uyvy += 4;
  }
  if (width & 1) {
    // Odd width: first Y of the last pair.
    dst_y[width - 1] = src_uyvy[1];
  }
}
3129 
#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)

// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// Each output colour byte is BLEND(foreground, background, alpha) where
// alpha is byte 3 of the foreground pixel; output byte 3 is always 255.
// This code mimics the SSSE3 version for better testability.
void ARGBBlendRow_C(const uint8_t* src_argb,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  int x;
  int p;
  for (x = 0; x < width - 1; x += 2) {
    // Two pixels per iteration; p is the byte offset of the pixel.
    for (p = 0; p < 8; p += 4) {
      // Read every input byte of the pixel before writing any output byte,
      // because dst_argb may alias either source.
      const uint32_t fb = src_argb[p + 0];
      const uint32_t fg = src_argb[p + 1];
      const uint32_t fr = src_argb[p + 2];
      const uint32_t a = src_argb[p + 3];
      const uint32_t bb = src_argb1[p + 0];
      const uint32_t bg = src_argb1[p + 1];
      const uint32_t br = src_argb1[p + 2];
      dst_argb[p + 0] = BLEND(fb, bb, a);
      dst_argb[p + 1] = BLEND(fg, bg, a);
      dst_argb[p + 2] = BLEND(fr, br, a);
      dst_argb[p + 3] = 255u;
    }
    src_argb += 8;
    src_argb1 += 8;
    dst_argb += 8;
  }

  if (width & 1) {
    // Trailing pixel of an odd width row.
    const uint32_t fb = src_argb[0];
    const uint32_t fg = src_argb[1];
    const uint32_t fr = src_argb[2];
    const uint32_t a = src_argb[3];
    const uint32_t bb = src_argb1[0];
    const uint32_t bg = src_argb1[1];
    const uint32_t br = src_argb1[2];
    dst_argb[0] = BLEND(fb, bb, a);
    dst_argb[1] = BLEND(fg, bg, a);
    dst_argb[2] = BLEND(fr, br, a);
    dst_argb[3] = 255u;
  }
}
#undef BLEND
3184 
// Weighted blend of two samples: (a * f + (255 - a) * b + 255) >> 8.
#define UBLEND(f, b, a) ((((a) * (f)) + ((255 - (a)) * (b)) + 255) >> 8)

// Blend plane src0 over plane src1 using a per-sample alpha plane and
// store the result to dst.
void BlendPlaneRow_C(const uint8_t* src0,
                     const uint8_t* src1,
                     const uint8_t* alpha,
                     uint8_t* dst,
                     int width) {
  int remaining = width;
  // Two samples per iteration.
  while (remaining > 1) {
    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
    src0 += 2;
    src1 += 2;
    alpha += 2;
    dst += 2;
    remaining -= 2;
  }
  // Trailing sample when width is odd.
  if (width & 1) {
    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
  }
}
#undef UBLEND
3205 
#if LIBYUV_ATTENUATE_DUP
// This code mimics the SSSE3 version for better testability.
#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
#else
#define ATTENUATE(f, a) (((f) * (a) + 128) >> 8)
#endif

// Attenuate a single pixel: B, G and R are scaled by alpha, alpha is copied.
// All four source bytes are read before any store.
static __inline void ARGBAttenuatePixel_C(const uint8_t* src_px,
                                          uint8_t* dst_px) {
  const uint32_t b = src_px[0];
  const uint32_t g = src_px[1];
  const uint32_t r = src_px[2];
  const uint32_t a = src_px[3];
  dst_px[0] = ATTENUATE(b, a);
  dst_px[1] = ATTENUATE(g, a);
  dst_px[2] = ATTENUATE(r, a);
  dst_px[3] = a;
}

// Multiply source RGB by alpha and store to destination.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int remaining = width;
  // Two pixels (8 bytes) per iteration.
  while (remaining > 1) {
    ARGBAttenuatePixel_C(src_argb, dst_argb);
    ARGBAttenuatePixel_C(src_argb + 4, dst_argb + 4);
    src_argb += 8;
    dst_argb += 8;
    remaining -= 2;
  }

  // Trailing pixel when width is odd.
  if (width & 1) {
    ARGBAttenuatePixel_C(src_argb, dst_argb);
  }
}
#undef ATTENUATE
3249 
// Reciprocal table for dividing source RGB by alpha, i.e. approximating
//   b = (b * 255 + (a / 2)) / a;
//   g = (g * 255 + (a / 2)) / a;
//   r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
// Entries 0, 1 and 255 are written out literally; every other entry i is
// T(i). T4/T16/T64 expand to 4/16/64 consecutive T() entries.
#define T(a) (0x01000000 + (0x10000 / (a)))
#define T4(a) T(a), T((a) + 1), T((a) + 2), T((a) + 3)
#define T16(a) T4(a), T4((a) + 4), T4((a) + 8), T4((a) + 12)
#define T64(a) T16(a), T16((a) + 16), T16((a) + 32), T16((a) + 48)
const uint32_t fixed_invtbl8[256] = {
    0x01000000, 0x0100ffff, T(0x02), T(0x03),  // 0x00..0x03
    T4(0x04),   T4(0x08),   T4(0x0c),          // 0x04..0x0f
    T16(0x10),  T16(0x20),  T16(0x30),         // 0x10..0x3f
    T64(0x40),  T64(0x80),                     // 0x40..0xbf
    T16(0xc0),  T16(0xd0),  T16(0xe0),         // 0xc0..0xef
    T4(0xf0),   T4(0xf4),   T4(0xf8),          // 0xf0..0xfb
    T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};  // 0xfc..0xff
#undef T64
#undef T16
#undef T4
#undef T
3296 
3297 #if LIBYUV_UNATTENUATE_DUP
3298 // This code mimics the Intel SIMD version for better testability.
3299 #define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
3300 #else
3301 #define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
3302 #endif
3303 
3304 // mimics the Intel SIMD code for exactness.
ARGBUnattenuateRow_C(const uint8_t * src_argb,uint8_t * dst_argb,int width)3305 void ARGBUnattenuateRow_C(const uint8_t* src_argb,
3306                           uint8_t* dst_argb,
3307                           int width) {
3308   int i;
3309   for (i = 0; i < width; ++i) {
3310     uint32_t b = src_argb[0];
3311     uint32_t g = src_argb[1];
3312     uint32_t r = src_argb[2];
3313     const uint32_t a = src_argb[3];
3314     const uint32_t ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
3315 
3316     // Clamping should not be necessary but is free in assembly.
3317     dst_argb[0] = UNATTENUATE(b, ia);
3318     dst_argb[1] = UNATTENUATE(g, ia);
3319     dst_argb[2] = UNATTENUATE(r, ia);
3320     dst_argb[3] = a;
3321     src_argb += 4;
3322     dst_argb += 4;
3323   }
3324 }
3325 
// For each 4-channel pixel, add the running per-channel sum of `row` up to
// and including that pixel to the matching entry of previous_cumsum and
// store the result in cumsum.
void ComputeCumulativeSumRow_C(const uint8_t* row,
                               int32_t* cumsum,
                               const int32_t* previous_cumsum,
                               int width) {
  int32_t running[4] = {0, 0, 0, 0};
  int x;
  int c;
  for (x = 0; x < width; ++x) {
    const int base = x * 4;
    // Accumulate this pixel's four channels first...
    for (c = 0; c < 4; ++c) {
      running[c] += row[base + c];
    }
    // ...then emit the four sums.
    for (c = 0; c < 4; ++c) {
      cumsum[base + c] = running[c] + previous_cumsum[base + c];
    }
  }
}
3343 
// For each of `count` 4-channel pixels, compute
//   bl[w + c] + tl[c] - bl[c] - tl[w + c]
// per channel c, multiply by 1 / area and store the result truncated to a
// byte. `w` is an offset in int32 elements; `area` must be non-zero.
void CumulativeSumToAverageRow_C(const int32_t* tl,
                                 const int32_t* bl,
                                 int w,
                                 int area,
                                 uint8_t* dst,
                                 int count) {
  float inv_area;
  int remaining;
  int c;
  assert(area != 0);

  inv_area = 1.0f / area;
  for (remaining = count; remaining > 0; --remaining) {
    for (c = 0; c < 4; ++c) {
      dst[c] = (uint8_t)((bl[w + c] + tl[c] - bl[c] - tl[w + c]) * inv_area);
    }
    dst += 4;
    tl += 4;
    bl += 4;
  }
}
3365 
3366 // Copy pixels from rotated source to destination row with a slope.
3367 LIBYUV_API
ARGBAffineRow_C(const uint8_t * src_argb,int src_argb_stride,uint8_t * dst_argb,const float * uv_dudv,int width)3368 void ARGBAffineRow_C(const uint8_t* src_argb,
3369                      int src_argb_stride,
3370                      uint8_t* dst_argb,
3371                      const float* uv_dudv,
3372                      int width) {
3373   int i;
3374   // Render a row of pixels from source into a buffer.
3375   float uv[2];
3376   uv[0] = uv_dudv[0];
3377   uv[1] = uv_dudv[1];
3378   for (i = 0; i < width; ++i) {
3379     int x = (int)(uv[0]);
3380     int y = (int)(uv[1]);
3381     *(uint32_t*)(dst_argb) =
3382         *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
3383     dst_argb += 4;
3384     uv[0] += uv_dudv[2];
3385     uv[1] += uv_dudv[3];
3386   }
3387 }
3388 
// Blend 2 rows into 1.
// Each output byte is the rounded average of a source byte and the byte
// src_uv_stride further on.
static void HalfRow_C(const uint8_t* src_uv,
                      ptrdiff_t src_uv_stride,
                      uint8_t* dst_uv,
                      int width) {
  const uint8_t* src_uv1 = src_uv + src_uv_stride;
  int x;
  for (x = 0; x < width; ++x) {
    const int sum = src_uv[x] + src_uv1[x];
    dst_uv[x] = (uint8_t)((sum + 1) >> 1);
  }
}
3399 
// 16-bit variant of the two-row blend: each output sample is the rounded
// average of a source sample and the sample src_uv_stride elements further on.
static void HalfRow_16_C(const uint16_t* src_uv,
                         ptrdiff_t src_uv_stride,
                         uint16_t* dst_uv,
                         int width) {
  const uint16_t* src_uv1 = src_uv + src_uv_stride;
  int x;
  for (x = 0; x < width; ++x) {
    const int sum = src_uv[x] + src_uv1[x];
    dst_uv[x] = (uint16_t)((sum + 1) >> 1);
  }
}
3409 
// Average two rows of 16-bit samples (rounding up) and reduce each result
// to 8 bits with C16TO8 and the given scale.
static void HalfRow_16To8_C(const uint16_t* src_uv,
                            ptrdiff_t src_uv_stride,
                            uint8_t* dst_uv,
                            int scale,
                            int width) {
  const uint16_t* src_uv1 = src_uv + src_uv_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x] = C16TO8((src_uv[x] + src_uv1[x] + 1) >> 1, scale);
  }
}
3420 
// C version 2x2 -> 2x1.
// Blends the row at src_ptr with the row src_stride bytes further on.
// source_y_fraction (0..255) is the weight of the second row out of 256.
void InterpolateRow_C(uint8_t* dst_ptr,
                      const uint8_t* src_ptr,
                      ptrdiff_t src_stride,
                      int width,
                      int source_y_fraction) {
  const int weight1 = source_y_fraction;
  const int weight0 = 256 - weight1;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int x;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);

  if (weight1 == 0) {
    // Second row has no weight: plain copy of the first row.
    memcpy(dst_ptr, src_ptr, width);
  } else if (weight1 == 128) {
    // Equal weights: rounded average of the two rows.
    HalfRow_C(src_ptr, src_stride, dst_ptr, width);
  } else {
    for (x = 0; x < width; ++x) {
      dst_ptr[x] = (src_ptr[x] * weight0 + src_ptr1[x] * weight1 + 128) >> 8;
    }
  }
}
3450 
// C version 2x2 -> 2x1.
// 16-bit variant: blends the row at src_ptr with the row src_stride elements
// further on. source_y_fraction (0..255) is the weight of the second row out
// of 256.
void InterpolateRow_16_C(uint16_t* dst_ptr,
                         const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  const int weight1 = source_y_fraction;
  const int weight0 = 256 - weight1;
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  int x;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);

  if (weight1 == 0) {
    // Second row has no weight: plain copy, 2 bytes per sample.
    memcpy(dst_ptr, src_ptr, width * 2);
  } else if (weight1 == 128) {
    // Equal weights: rounded average of the two rows.
    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
  } else {
    for (x = 0; x < width; ++x) {
      dst_ptr[x] = (src_ptr[x] * weight0 + src_ptr1[x] * weight1 + 128) >> 8;
    }
  }
}
3480 
// C version 2x2 16 bit-> 2x1 8 bit.
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
//
// source_y_fraction (0..255) is the weight of the second row out of 256.

void InterpolateRow_16To8_C(uint8_t* dst_ptr,
                            const uint16_t* src_ptr,
                            ptrdiff_t src_stride,
                            int scale,
                            int width,
                            int source_y_fraction) {
  const int weight1 = source_y_fraction;
  const int weight0 = 256 - weight1;
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  int x;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);

  if (source_y_fraction == 0) {
    // Second row has no weight: only the 16 -> 8 bit conversion is needed.
    Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
  } else if (source_y_fraction == 128) {
    // Equal weights: average the rows, then convert.
    HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
  } else {
    for (x = 0; x < width; ++x) {
      dst_ptr[x] = C16TO8(
          (src_ptr[x] * weight0 + src_ptr1[x] * weight1 + 128) >> 8, scale);
    }
  }
}
3519 
// Use first 4 shuffler values to reorder ARGB channels.
// Output byte c of every pixel is source byte shuffler[c] of that pixel.
void ARGBShuffleRow_C(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      const uint8_t* shuffler,
                      int width) {
  int index[4];
  int remaining;
  int c;
  // The shuffler is read once, before any pixel is written.
  for (c = 0; c < 4; ++c) {
    index[c] = shuffler[c];
  }
  for (remaining = width; remaining > 0; --remaining) {
    // Gather the whole pixel before storing, to support in-place conversion.
    uint8_t pixel[4];
    for (c = 0; c < 4; ++c) {
      pixel[c] = src_argb[index[c]];
    }
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = pixel[c];
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
3545 
// Interleave planar Y, U and V samples into 4-byte groups ordered
// Y0, U, Y1, V. Each group consumes two Y samples and one U and one V.
void I422ToYUY2Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width) {
  int remaining = width;
  while (remaining > 1) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = *src_u++;
    dst_frame[2] = src_y[1];
    dst_frame[3] = *src_v++;
    dst_frame += 4;
    src_y += 2;
    remaining -= 2;
  }
  // Odd width: the missing second Y of the last group is written as 0.
  if (width & 1) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = 0;
    dst_frame[3] = src_v[0];
  }
}
3569 
// Interleave planar Y, U and V samples into 4-byte groups ordered
// U, Y0, V, Y1. Each group consumes two Y samples and one U and one V.
void I422ToUYVYRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width) {
  int remaining = width;
  while (remaining > 1) {
    dst_frame[0] = *src_u++;
    dst_frame[1] = src_y[0];
    dst_frame[2] = *src_v++;
    dst_frame[3] = src_y[1];
    dst_frame += 4;
    src_y += 2;
    remaining -= 2;
  }
  // Odd width: the missing second Y of the last group is written as 0.
  if (width & 1) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = 0;
  }
}
3593 
// Apply a cubic polynomial to every channel of every pixel. For channel c
// with value v the result is
//   poly[c] + poly[c + 4] * v + poly[c + 8] * v^2 + poly[c + 12] * v^3
// converted to int32_t and passed through Clamp().
void ARGBPolynomialRow_C(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const float* poly,
                         int width) {
  int remaining;
  int c;
  for (remaining = width; remaining > 0; --remaining) {
    // Evaluate all four channels before storing any of them.
    float result[4];
    for (c = 0; c < 4; ++c) {
      const float v = (float)(src_argb[c]);
      const float v2 = v * v;
      const float v3 = v2 * v;
      float d = poly[c] + poly[c + 4] * v;
      d += poly[c + 8] * v2;
      d += poly[c + 12] * v3;
      result[c] = d;
    }

    for (c = 0; c < 4; ++c) {
      dst_argb[c] = Clamp((int32_t)(result[c]));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
3633 
// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
// adjust the source integer range to the half float range desired.

// This magic constant is 2^-112. Multiplying by this
// is the same as subtracting 112 from the exponent, which
// is the difference in exponent bias between 32-bit and
// 16-bit floats. Once we've done this subtraction, we can
// simply extract the low bits of the exponent and the high
// bits of the mantissa from our float and we're done.

// Work around GCC 7 punning warning -Wstrict-aliasing
// NOTE(review): HalfFloatRow_C below no longer uses this type; it is kept in
// case other code in this file refers to it.
#if defined(__GNUC__)
typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
#else
typedef uint32_t uint32_alias_t;
#endif

// Convert `width` 16-bit integer samples to 16-bit half floats, scaling each
// sample by `scale`. The low 13 mantissa bits are dropped, not rounded.
void HalfFloatRow_C(const uint16_t* src,
                    uint16_t* dst,
                    float scale,
                    int width) {
  int i;
  const float mult = 1.9259299444e-34f * scale;
  for (i = 0; i < width; ++i) {
    const float value = src[i] * mult;
    // Read the float's bit pattern with memcpy instead of a pointer cast:
    // the cast is only protected by may_alias on GCC-compatible compilers
    // and is an aliasing violation elsewhere.
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));
    dst[i] = (uint16_t)(bits >> 13);
  }
}
3662 
// Convert `width` bytes to floats, multiplying each by `scale`.
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    const float scaled = *src * scale;
    *dst = scaled;
    ++src;
    ++dst;
  }
}
3670 
// Remap one pixel through the luma table. The weighted sum of the first
// three source bytes, masked with 0x7F00, selects a 256-byte row of `luma`;
// each of those bytes is then looked up in that row. Byte 3 is copied.
static __inline void ARGBLumaColorTablePixel_C(const uint8_t* src_px,
                                               uint8_t* dst_px,
                                               const uint8_t* luma,
                                               uint32_t bc,
                                               uint32_t gc,
                                               uint32_t rc) {
  // Luminance in rows, color values in columns.
  const uint8_t* table_row =
      ((src_px[0] * bc + src_px[1] * gc + src_px[2] * rc) & 0x7F00u) + luma;
  dst_px[0] = table_row[src_px[0]];
  dst_px[1] = table_row[src_px[1]];
  dst_px[2] = table_row[src_px[2]];
  dst_px[3] = src_px[3];
}

// Apply a luma-dependent color table to a row of ARGB pixels. lumacoeff
// packs the three per-channel weights into its low three bytes.
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width,
                             const uint8_t* luma,
                             uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;
  int remaining = width;

  // Two pixels (8 bytes) per iteration.
  while (remaining > 1) {
    ARGBLumaColorTablePixel_C(src_argb, dst_argb, luma, bc, gc, rc);
    ARGBLumaColorTablePixel_C(src_argb + 4, dst_argb + 4, luma, bc, gc, rc);
    src_argb += 8;
    dst_argb += 8;
    remaining -= 2;
  }
  // Trailing pixel when width is odd.
  if (width & 1) {
    ARGBLumaColorTablePixel_C(src_argb, dst_argb, luma, bc, gc, rc);
  }
}
3712 
// Copy byte 3 (alpha) of every 4-byte source pixel into byte 3 of the
// matching destination pixel; the other destination bytes are untouched.
void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int remaining = width;
  // Two pixels (8 bytes) per iteration.
  while (remaining > 1) {
    dst[3] = src[3];
    dst[7] = src[7];
    src += 8;
    dst += 8;
    remaining -= 2;
  }
  // Trailing pixel when width is odd.
  if (width & 1) {
    dst[3] = src[3];
  }
}
3725 
// Gather byte 3 (alpha) of every 4-byte source pixel into a packed row of
// single bytes.
void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
  int remaining = width;
  // Two pixels (8 source bytes, 2 output bytes) per iteration.
  while (remaining > 1) {
    dst_a[0] = src_argb[3];
    dst_a[1] = src_argb[7];
    src_argb += 8;
    dst_a += 2;
    remaining -= 2;
  }
  // Trailing pixel when width is odd.
  if (width & 1) {
    dst_a[0] = src_argb[3];
  }
}
3738 
// Store each source byte into byte 3 (alpha) of the matching 4-byte
// destination pixel; the other destination bytes are untouched.
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int remaining = width;
  // Two pixels (2 source bytes, 8 destination bytes) per iteration.
  while (remaining > 1) {
    dst[3] = src[0];
    dst[7] = src[1];
    src += 2;
    dst += 8;
    remaining -= 2;
  }
  // Trailing pixel when width is odd.
  if (width & 1) {
    dst[3] = src[0];
  }
}
3751 
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048

#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \
    defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
// Each chunk of up to MAXTWIDTH pixels is passed to I422ToARGBRow_SSSE3 to
// fill a temporary row, which ARGBToRGB565Row_SSE2 then writes to dst_rgb565.
void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Row buffer for intermediate pixels, 4 bytes each.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
    src_y += twidth;
    src_u += twidth / 2;  // U and V advance half as fast as Y.
    src_v += twidth / 2;
    dst_rgb565 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3777 
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
// 2 step wrapper: each chunk of up to MAXTWIDTH pixels is passed to
// I422ToARGBRow_SSSE3 to fill a temporary row, which ARGBToARGB1555Row_SSE2
// then writes to dst_argb1555.
void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
    src_y += twidth;
    src_u += twidth / 2;  // U and V advance half as fast as Y.
    src_v += twidth / 2;
    dst_argb1555 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3799 
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
// 2 step wrapper: each chunk of up to MAXTWIDTH pixels is passed to
// I422ToARGBRow_SSSE3 to fill a temporary row, which ARGBToARGB4444Row_SSE2
// then writes to dst_argb4444.
void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
    src_y += twidth;
    src_u += twidth / 2;  // U and V advance half as fast as Y.
    src_v += twidth / 2;
    dst_argb4444 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3821 
#if defined(HAS_NV12TORGB565ROW_SSSE3)
// 2 step wrapper: each chunk of up to MAXTWIDTH pixels is passed to
// NV12ToARGBRow_SSSE3 to fill a temporary row, which ARGBToRGB565Row_SSE2
// then writes to dst_rgb565.
void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
                           const uint8_t* src_uv,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
    src_y += twidth;
    src_uv += twidth;  // The UV pointer advances 1 byte per pixel.
    dst_rgb565 += twidth * 2;  // 2 output bytes per pixel.
  }
}
#endif
3841 
#if defined(HAS_NV12TORGB24ROW_SSSE3)
// 2 step wrapper: each chunk of up to MAXTWIDTH pixels is passed to
// NV12ToARGBRow_SSSE3 to fill a temporary row, which ARGBToRGB24Row_SSSE3
// then writes to dst_rgb24.
void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
    src_y += twidth;
    src_uv += twidth;  // The UV pointer advances 1 byte per pixel.
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3861 
#if defined(HAS_NV21TORGB24ROW_SSSE3)
// 2 step wrapper: each chunk of up to MAXTWIDTH pixels is passed to
// NV21ToARGBRow_SSSE3 to fill a temporary row, which ARGBToRGB24Row_SSSE3
// then writes to dst_rgb24.
void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth);
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
    src_y += twidth;
    src_vu += twidth;  // The VU pointer advances 1 byte per pixel.
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3881 
#if defined(HAS_NV12TORGB24ROW_AVX2)
// 2 step wrapper: each chunk of up to MAXTWIDTH pixels is passed to
// NV12ToARGBRow_AVX2 to fill a temporary row, which is then written to
// dst_rgb24 by ARGBToRGB24Row_AVX2 when available, else ARGBToRGB24Row_SSSE3.
void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
    src_y += twidth;
    src_uv += twidth;  // The UV pointer advances 1 byte per pixel.
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3905 
#if defined(HAS_NV21TORGB24ROW_AVX2)
// 2 step wrapper: each chunk of up to MAXTWIDTH pixels is passed to
// NV21ToARGBRow_AVX2 to fill a temporary row, which is then written to
// dst_rgb24 by ARGBToRGB24Row_AVX2 when available, else ARGBToRGB24Row_SSSE3.
void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Row buffer for intermediate ARGB pixels.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  int twidth;
  for (; width > 0; width -= twidth) {
    twidth = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
    src_y += twidth;
    src_vu += twidth;  // The VU pointer advances 1 byte per pixel.
    dst_rgb24 += twidth * 3;  // 3 output bytes per pixel.
  }
}
#endif
3929 
#if defined(HAS_I422TORGB565ROW_AVX2)
// Converts a row of I422 (separate Y, U and V planes) to RGB565.
// The row is processed in pieces of at most MAXTWIDTH pixels: each piece is
// first expanded to ARGB in a temporary buffer, then packed to RGB565.
void I422ToRGB565Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb, yuvconstants, n);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(argb, dst_rgb565, n);
#else
    ARGBToRGB565Row_SSE2(argb, dst_rgb565, n);
#endif
    // U and V advance half as far as Y; the output is 2 bytes per pixel.
    src_y += n;
    src_u += n / 2;
    src_v += n / 2;
    dst_rgb565 += n * 2;
  }
}
#endif
3954 
#if defined(HAS_I422TOARGB1555ROW_AVX2)
// Converts a row of I422 (separate Y, U and V planes) to ARGB1555.
// The row is processed in pieces of at most MAXTWIDTH pixels: each piece is
// first expanded to ARGB in a temporary buffer, then packed to ARGB1555.
void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb, yuvconstants, n);
#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
    ARGBToARGB1555Row_AVX2(argb, dst_argb1555, n);
#else
    ARGBToARGB1555Row_SSE2(argb, dst_argb1555, n);
#endif
    // U and V advance half as far as Y; the output is 2 bytes per pixel.
    src_y += n;
    src_u += n / 2;
    src_v += n / 2;
    dst_argb1555 += n * 2;
  }
}
#endif
3980 
#if defined(HAS_I422TOARGB4444ROW_AVX2)
// Converts a row of I422 (separate Y, U and V planes) to ARGB4444.
// The row is processed in pieces of at most MAXTWIDTH pixels: each piece is
// first expanded to ARGB in a temporary buffer, then packed to ARGB4444.
void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb, yuvconstants, n);
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
    ARGBToARGB4444Row_AVX2(argb, dst_argb4444, n);
#else
    ARGBToARGB4444Row_SSE2(argb, dst_argb4444, n);
#endif
    // U and V advance half as far as Y; the output is 2 bytes per pixel.
    src_y += n;
    src_u += n / 2;
    src_v += n / 2;
    dst_argb4444 += n * 2;
  }
}
#endif
4006 
#if defined(HAS_I422TORGB24ROW_AVX2)
// Converts a row of I422 (separate Y, U and V planes) to RGB24.
// The row is processed in pieces of at most MAXTWIDTH pixels: each piece is
// first expanded to ARGB in a temporary buffer, then packed down to RGB24.
void I422ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb, yuvconstants, n);
#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(argb, dst_rgb24, n);
#else
    ARGBToRGB24Row_SSSE3(argb, dst_rgb24, n);
#endif
    // U and V advance half as far as Y; the output is 3 bytes per pixel.
    src_y += n;
    src_u += n / 2;
    src_v += n / 2;
    dst_rgb24 += n * 3;
  }
}
#endif
4032 
#if defined(HAS_NV12TORGB565ROW_AVX2)
// Converts a row of NV12 (Y plane plus interleaved UV) to RGB565.
// The row is processed in pieces of at most MAXTWIDTH pixels: each piece is
// first expanded to ARGB in a temporary buffer, then packed to RGB565.
void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    NV12ToARGBRow_AVX2(src_y, src_uv, argb, yuvconstants, n);
#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(argb, dst_rgb565, n);
#else
    ARGBToRGB565Row_SSE2(argb, dst_rgb565, n);
#endif
    // Y and UV both advance n bytes; the output is 2 bytes per pixel.
    src_y += n;
    src_uv += n;
    dst_rgb565 += n * 2;
  }
}
#endif
4056 
#ifdef HAS_RGB24TOYJROW_AVX2
// Converts a row of RGB24 (3 bytes per pixel) to YJ values (1 byte per pixel).
// Works in pieces of at most MAXTWIDTH pixels: each piece is expanded to ARGB
// with RGB24ToARGBRow_SSSE3 and then reduced with ARGBToYJRow_AVX2.
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RGB24ToARGBRow_SSSE3(src_rgb24, argb, n);
    ARGBToYJRow_AVX2(argb, dst_yj, n);
    src_rgb24 += n * 3;
    dst_yj += n;
  }
}
#endif  // HAS_RGB24TOYJROW_AVX2
4072 
#ifdef HAS_RAWTOYJROW_AVX2
// Converts a row of RAW (3 bytes per pixel) to YJ values (1 byte per pixel).
// Works in pieces of at most MAXTWIDTH pixels: each piece is expanded to ARGB
// with RAWToARGBRow_SSSE3 and then reduced with ARGBToYJRow_AVX2.
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RAWToARGBRow_SSSE3(src_raw, argb, n);
    ARGBToYJRow_AVX2(argb, dst_yj, n);
    src_raw += n * 3;
    dst_yj += n;
  }
}
#endif  // HAS_RAWTOYJROW_AVX2
4088 
#ifdef HAS_RGB24TOYJROW_SSSE3
// Converts a row of RGB24 (3 bytes per pixel) to YJ values (1 byte per pixel).
// Works in pieces of at most MAXTWIDTH pixels: each piece is expanded to ARGB
// with RGB24ToARGBRow_SSSE3 and then reduced with ARGBToYJRow_SSSE3.
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RGB24ToARGBRow_SSSE3(src_rgb24, argb, n);
    ARGBToYJRow_SSSE3(argb, dst_yj, n);
    src_rgb24 += n * 3;
    dst_yj += n;
  }
}
#endif  // HAS_RGB24TOYJROW_SSSE3
4104 
#ifdef HAS_RAWTOYJROW_SSSE3
// Converts a row of RAW (3 bytes per pixel) to YJ values (1 byte per pixel).
// Works in pieces of at most MAXTWIDTH pixels: each piece is expanded to ARGB
// with RAWToARGBRow_SSSE3 and then reduced with ARGBToYJRow_SSSE3.
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  // Temporary ARGB pixels for one piece (4 bytes per pixel).
  SIMD_ALIGNED(uint8_t argb[MAXTWIDTH * 4]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    RAWToARGBRow_SSSE3(src_raw, argb, n);
    ARGBToYJRow_SSSE3(argb, dst_yj, n);
    src_raw += n * 3;
    dst_yj += n;
  }
}
#endif  // HAS_RAWTOYJROW_SSSE3
4120 
#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
// Interpolates 16 bit source rows and writes the result as 8 bit samples.
// Works in pieces of at most MAXTWIDTH samples: InterpolateRow_16_C fills a
// temporary 16 bit buffer, which Convert16To8Row_AVX2 then narrows using
// `scale`.
void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
                               const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
                               int scale,
                               int width,
                               int source_y_fraction) {
  // Temporary 16 bit samples for one piece.
  SIMD_ALIGNED(uint16_t blended[MAXTWIDTH]);
  int n;
  for (; width > 0; width -= n) {
    n = (width < MAXTWIDTH) ? width : MAXTWIDTH;
    InterpolateRow_16_C(blended, src_ptr, src_stride, n, source_y_fraction);
    Convert16To8Row_AVX2(blended, dst_ptr, scale, n);
    src_ptr += n;
    dst_ptr += n;
  }
}
#endif  // HAS_INTERPOLATEROW_16TO8_AVX2
4140 
// Writes src[i] * scale to dst[i] for `width` samples and returns the sum of
// the squares of the unscaled source samples.
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
  float sum_sq = 0.f;
  int x;
  for (x = 0; x < width; ++x) {
    const float sample = src[x];
    sum_sq += sample * sample;
    dst[x] = sample * scale;
  }
  return sum_sq;
}
4151 
// Writes src[i] * scale to dst[i] for `width` samples and returns the largest
// unscaled source sample. The running maximum starts at 0, so the result is
// never below 0.
float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
  float peak = 0.f;
  int x;
  for (x = 0; x < width; ++x) {
    const float sample = src[x];
    const float scaled = sample * scale;
    if (sample > peak) {
      peak = sample;
    }
    dst[x] = scaled;
  }
  return peak;
}
4163 
// Writes src[i] * scale to dst[i] for `width` samples.
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[x] * scale;
  }
}
4170 
// Horizontal 5 tap filter with weights 1, 4, 6, 4, 1. Each output is the
// weighted sum of 5 consecutive inputs, plus 128 for rounding, shifted right
// by 8. Reads width + 4 source values.
void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t* tap = src + x;
    // Symmetric taps are grouped; unsigned arithmetic gives the same result.
    uint32_t sum = tap[0] + tap[4];
    sum += (tap[1] + tap[3]) * 4;
    sum += tap[2] * 6;
    dst[x] = (uint16_t)((sum + 128) >> 8);
  }
}
4179 
// Vertical 5 tap filter: combines 5 rows with weights 1, 4, 6, 4, 1 into one
// row of unnormalized 32 bit sums.
void GaussCol_C(const uint16_t* src0,
                const uint16_t* src1,
                const uint16_t* src2,
                const uint16_t* src3,
                const uint16_t* src4,
                uint32_t* dst,
                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int outer = src0[x] + src4[x];
    const int inner = src1[x] + src3[x];
    dst[x] = outer + inner * 4 + src2[x] * 6;
  }
}
4193 
// Horizontal 5 tap float filter with weights 1, 4, 6, 4, 1; the weighted sum
// is multiplied by 1/256. Reads width + 4 source values.
void GaussRow_F32_C(const float* src, float* dst, int width) {
  const float kNormalize = 1.0f / 256.0f;
  int x;
  for (x = 0; x < width; ++x) {
    const float* tap = src + x;
    // Kept as one left-to-right expression so rounding is unchanged.
    dst[x] =
        (tap[0] + tap[1] * 4 + tap[2] * 6 + tap[3] * 4 + tap[4]) * kNormalize;
  }
}
4202 
// Vertical 5 tap float filter: combines 5 rows with weights 1, 4, 6, 4, 1 into
// one row of unnormalized sums.
void GaussCol_F32_C(const float* src0,
                    const float* src1,
                    const float* src2,
                    const float* src3,
                    const float* src4,
                    float* dst,
                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // Kept as one left-to-right expression so rounding is unchanged.
    dst[x] = src0[x] + src1[x] * 4 + src2[x] * 6 + src3[x] * 4 + src4[x];
  }
}
4216 
// Converts biplanar NV21 (Y plane plus interleaved VU) to packed YUV24.
// Each output pixel is 3 bytes in V, U, Y order; two horizontally adjacent
// pixels share one VU pair.
void NV21ToYUV24Row_C(const uint8_t* src_y,
                      const uint8_t* src_vu,
                      uint8_t* dst_yuv24,
                      int width) {
  int x;
  // Full pairs of pixels.
  for (x = 0; x + 1 < width; x += 2) {
    const uint8_t* vu = src_vu + x;
    uint8_t* out = dst_yuv24 + x * 3;
    out[0] = vu[0];         // V
    out[1] = vu[1];         // U
    out[2] = src_y[x];      // Y0
    out[3] = vu[0];         // V
    out[4] = vu[1];         // U
    out[5] = src_y[x + 1];  // Y1
  }
  // Odd width: one trailing pixel with its own VU pair.
  if (width & 1) {
    const uint8_t* vu = src_vu + x;
    uint8_t* out = dst_yuv24 + x * 3;
    out[0] = vu[0];     // V
    out[1] = vu[1];     // U
    out[2] = src_y[x];  // Y0
  }
}
4240 
// Filters 2 rows of AYUV chroma (444) down to one row of UV (420).
// AYUV is stored as V, U, Y, A in memory; the output is in U, V order as used
// by NV12. Each output pair is the rounded average of a 2x2 block of pixels.
void AYUVToUVRow_C(const uint8_t* src_ayuv,
                   int src_stride_ayuv,
                   uint8_t* dst_uv,
                   int width) {
  const uint8_t* top = src_ayuv;
  int x;
  for (x = 0; x + 1 < width; x += 2) {
    const uint8_t* bottom = top + src_stride_ayuv;
    dst_uv[x] = (uint8_t)((top[1] + top[5] + bottom[1] + bottom[5] + 2) >> 2);
    dst_uv[x + 1] =
        (uint8_t)((top[0] + top[4] + bottom[0] + bottom[4] + 2) >> 2);
    top += 8;  // 2 pixels of 4 bytes.
  }
  // Odd width: the last column is averaged vertically only.
  if (width & 1) {
    dst_uv[x] = (uint8_t)((top[1] + top[src_stride_ayuv + 1] + 1) >> 1);
    dst_uv[x + 1] = (uint8_t)((top[0] + top[src_stride_ayuv + 0] + 1) >> 1);
  }
}
4264 
// Filters 2 rows of AYUV chroma (444) down to one row of VU (420).
// Source bytes 0 and 1 of each 4 byte pixel are written to dst_vu in that
// same order. Each output pair is the rounded average of a 2x2 block.
void AYUVToVURow_C(const uint8_t* src_ayuv,
                   int src_stride_ayuv,
                   uint8_t* dst_vu,
                   int width) {
  const uint8_t* top = src_ayuv;
  int x;
  for (x = 0; x + 1 < width; x += 2) {
    const uint8_t* bottom = top + src_stride_ayuv;
    dst_vu[x] = (uint8_t)((top[0] + top[4] + bottom[0] + bottom[4] + 2) >> 2);
    dst_vu[x + 1] =
        (uint8_t)((top[1] + top[5] + bottom[1] + bottom[5] + 2) >> 2);
    top += 8;  // 2 pixels of 4 bytes.
  }
  // Odd width: the last column is averaged vertically only.
  if (width & 1) {
    dst_vu[x] = (uint8_t)((top[0] + top[src_stride_ayuv + 0] + 1) >> 1);
    dst_vu[x + 1] = (uint8_t)((top[1] + top[src_stride_ayuv + 1] + 1) >> 1);
  }
}
4287 
// Extracts the Y channel from a row of AYUV into a row of Y.
// Each source pixel is 4 bytes (v, u, y, a); byte 2 is copied.
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_ayuv[x * 4 + 2];
  }
}
4297 
// Converts the UV plane of NV12 to the VU plane of NV21 by swapping the two
// bytes of each pair. `width` counts pairs. Both bytes are read before either
// is written.
void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  int pair;
  for (pair = 0; pair < width; ++pair) {
    const int at = pair * 2;
    const uint8_t first = src_uv[at];
    const uint8_t second = src_uv[at + 1];
    dst_vu[at] = second;
    dst_vu[at + 1] = first;
  }
}
4310 
// Downsamples separate U and V planes by 2x2 and interleaves the result.
// Each output U, V pair is the rounded average of a 2x2 block taken from two
// rows of each plane. `width` is the source width in samples.
void HalfMergeUVRow_C(const uint8_t* src_u,
                      int src_stride_u,
                      const uint8_t* src_v,
                      int src_stride_v,
                      uint8_t* dst_uv,
                      int width) {
  int x;
  for (x = 0; x + 1 < width; x += 2) {
    const uint8_t* u = src_u + x;
    const uint8_t* v = src_v + x;
    dst_uv[x] =
        (uint8_t)((u[0] + u[1] + u[src_stride_u] + u[src_stride_u + 1] + 2) >>
                  2);
    dst_uv[x + 1] =
        (uint8_t)((v[0] + v[1] + v[src_stride_v] + v[src_stride_v + 1] + 2) >>
                  2);
  }
  // Odd width: the last column is averaged vertically only.
  if (width & 1) {
    const uint8_t* u = src_u + x;
    const uint8_t* v = src_v + x;
    dst_uv[x] = (uint8_t)((u[0] + u[src_stride_u] + 1) >> 1);
    dst_uv[x + 1] = (uint8_t)((v[0] + v[src_stride_v] + 1) >> 1);
  }
}
4334 
4335 #ifdef __cplusplus
4336 }  // extern "C"
4337 }  // namespace libyuv
4338 #endif
4339