• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "ultrahdr/gainmapmath.h"
18 
19 #include <arm_neon.h>
20 #include <cassert>
21 
22 namespace ultrahdr {
23 
// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an
// off-by-one difference compared to the scalar floating-point implementation.

// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
// {Y1, Y2, U1, U2, V1, V2, 0, 0}
30 
// BT.709 YUV -> BT.601 YUV. Each stored value is the matching factor below multiplied by 2^14 and
// rounded to the nearest integer; the constant 1.0 / 0.0 factors applied to Y are not stored.
//   Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
//   U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
//   V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
__attribute__((aligned(16))) const int16_t kYuv709To601_coeffs_neon[8] = {
    1664,  3213,   // Y' contribution from U, V
    16218, -1813,  // U' contribution from U, V
    -1187, 16112,  // V' contribution from U, V
    0,     0,      // unused lanes
};
37 
// BT.709 YUV -> BT.2100 YUV. Each stored value is the matching factor below multiplied by 2^14
// and rounded to the nearest integer; the constant 1.0 / 0.0 factors applied to Y are not stored.
//   Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
//   U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
//   V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
__attribute__((aligned(16))) const int16_t kYuv709To2100_coeffs_neon[8] = {
    -278,  1578,   // Y' contribution from U, V
    16307, -839,   // U' contribution from U, V
    189,   16427,  // V' contribution from U, V
    0,     0,      // unused lanes
};
44 
// BT.601 YUV -> BT.709 YUV. Each stored value is the matching factor below multiplied by 2^14 and
// rounded to the nearest integer; the constant 1.0 / 0.0 factors applied to Y are not stored.
//   Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V)
//   U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V)
//   V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V)
__attribute__((aligned(16))) const int16_t kYuv601To709_coeffs_neon[8] = {
    -1936, -3485,  // Y' contribution from U, V
    16689, 1878,   // U' contribution from U, V
    1230,  16799,  // V' contribution from U, V
    0,     0,      // unused lanes
};
51 
// BT.601 YUV -> BT.2100 YUV. Each stored value is the matching factor below multiplied by 2^14
// and rounded to the nearest integer; the constant 1.0 / 0.0 factors applied to Y are not stored.
//   Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
//   U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
//   V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
__attribute__((aligned(16))) const int16_t kYuv601To2100_coeffs_neon[8] = {
    -2101, -1899,  // Y' contribution from U, V
    16548, 1009,   // U' contribution from U, V
    1425,  16865,  // V' contribution from U, V
    0,     0,      // unused lanes
};
58 
// BT.2100 YUV -> BT.709 YUV. Each stored value is the matching factor below multiplied by 2^14
// and rounded to the nearest integer; the constant 1.0 / 0.0 factors applied to Y are not stored.
//   Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
//   U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
//   V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
__attribute__((aligned(16))) const int16_t kYuv2100To709_coeffs_neon[8] = {
    297,   -1559,  // Y' contribution from U, V
    16452, 840,    // U' contribution from U, V
    -189,  16331,  // V' contribution from U, V
    0,     0,      // unused lanes
};
65 
// BT.2100 YUV -> BT.601 YUV. Each stored value is the matching factor below multiplied by 2^14
// and rounded to the nearest integer; the constant 1.0 / 0.0 factors applied to Y are not stored.
//   Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
//   U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
//   V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
__attribute__((aligned(16))) const int16_t kYuv2100To601_coeffs_neon[8] = {
    1931,  1729,   // Y' contribution from U, V
    16306, -976,   // U' contribution from U, V
    -1378, 15999,  // V' contribution from U, V
    0,     0,      // unused lanes
};
72 
yConversion_neon(uint8x8_t y,int16x8_t u,int16x8_t v,int16x8_t coeffs)73 static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
74   int32x4_t lo = vmull_laneq_s16(vget_low_s16(u), coeffs, 0);
75   int32x4_t hi = vmull_laneq_s16(vget_high_s16(u), coeffs, 0);
76   lo = vmlal_laneq_s16(lo, vget_low_s16(v), coeffs, 1);
77   hi = vmlal_laneq_s16(hi, vget_high_s16(v), coeffs, 1);
78 
79   // Descale result to account for coefficients being scaled by 2^14.
80   uint16x8_t y_output =
81       vreinterpretq_u16_s16(vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14)));
82   return vreinterpretq_s16_u16(vaddw_u8(y_output, y));
83 }
84 
uConversion_neon(int16x8_t u,int16x8_t v,int16x8_t coeffs)85 static inline int16x8_t uConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
86   int32x4_t u_lo = vmull_laneq_s16(vget_low_s16(u), coeffs, 2);
87   int32x4_t u_hi = vmull_laneq_s16(vget_high_s16(u), coeffs, 2);
88   u_lo = vmlal_laneq_s16(u_lo, vget_low_s16(v), coeffs, 3);
89   u_hi = vmlal_laneq_s16(u_hi, vget_high_s16(v), coeffs, 3);
90 
91   // Descale result to account for coefficients being scaled by 2^14.
92   const int16x8_t u_output = vcombine_s16(vqrshrn_n_s32(u_lo, 14), vqrshrn_n_s32(u_hi, 14));
93   return u_output;
94 }
95 
vConversion_neon(int16x8_t u,int16x8_t v,int16x8_t coeffs)96 static inline int16x8_t vConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
97   int32x4_t v_lo = vmull_laneq_s16(vget_low_s16(u), coeffs, 4);
98   int32x4_t v_hi = vmull_laneq_s16(vget_high_s16(u), coeffs, 4);
99   v_lo = vmlal_laneq_s16(v_lo, vget_low_s16(v), coeffs, 5);
100   v_hi = vmlal_laneq_s16(v_hi, vget_high_s16(v), coeffs, 5);
101 
102   // Descale result to account for coefficients being scaled by 2^14.
103   const int16x8_t v_output = vcombine_s16(vqrshrn_n_s32(v_lo, 14), vqrshrn_n_s32(v_hi, 14));
104   return v_output;
105 }
106 
yuvConversion_neon(uint8x8_t y,int16x8_t u,int16x8_t v,int16x8_t coeffs)107 int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
108   const int16x8_t y_output = yConversion_neon(y, u, v, coeffs);
109   const int16x8_t u_output = uConversion_neon(u, v, coeffs);
110   const int16x8_t v_output = vConversion_neon(u, v, coeffs);
111   return {y_output, u_output, v_output};
112 }
113 
transformYuv420_neon(jr_uncompressed_ptr image,const int16_t * coeffs_ptr)114 void transformYuv420_neon(jr_uncompressed_ptr image, const int16_t* coeffs_ptr) {
115   // Implementation assumes image buffer is multiple of 16.
116   assert(image->width % 16 == 0);
117   uint8_t* y0_ptr = static_cast<uint8_t*>(image->data);
118   uint8_t* y1_ptr = y0_ptr + image->luma_stride;
119   uint8_t* u_ptr = static_cast<uint8_t*>(image->chroma_data);
120   uint8_t* v_ptr = u_ptr + image->chroma_stride * (image->height / 2);
121 
122   const int16x8_t coeffs = vld1q_s16(coeffs_ptr);
123   const uint16x8_t uv_bias = vreinterpretq_u16_s16(vdupq_n_s16(-128));
124   size_t h = 0;
125   do {
126     size_t w = 0;
127     do {
128       uint8x16_t y0 = vld1q_u8(y0_ptr + w * 2);
129       uint8x16_t y1 = vld1q_u8(y1_ptr + w * 2);
130       uint8x8_t u = vld1_u8(u_ptr + w);
131       uint8x8_t v = vld1_u8(v_ptr + w);
132 
133       // 128 bias for UV given we are using libjpeg; see:
134       // https://github.com/kornelski/libjpeg/blob/master/structure.doc
135       int16x8_t u_wide_s16 = vreinterpretq_s16_u16(vaddw_u8(uv_bias, u));  // -128 + u
136       int16x8_t v_wide_s16 = vreinterpretq_s16_u16(vaddw_u8(uv_bias, v));  // -128 + v
137 
138       const int16x8_t u_wide_lo = vzip1q_s16(u_wide_s16, u_wide_s16);
139       const int16x8_t u_wide_hi = vzip2q_s16(u_wide_s16, u_wide_s16);
140       const int16x8_t v_wide_lo = vzip1q_s16(v_wide_s16, v_wide_s16);
141       const int16x8_t v_wide_hi = vzip2q_s16(v_wide_s16, v_wide_s16);
142 
143       const int16x8_t y0_lo = yConversion_neon(vget_low_u8(y0), u_wide_lo, v_wide_lo, coeffs);
144       const int16x8_t y0_hi = yConversion_neon(vget_high_u8(y0), u_wide_hi, v_wide_hi, coeffs);
145       const int16x8_t y1_lo = yConversion_neon(vget_low_u8(y1), u_wide_lo, v_wide_lo, coeffs);
146       const int16x8_t y1_hi = yConversion_neon(vget_high_u8(y1), u_wide_hi, v_wide_hi, coeffs);
147 
148       const int16x8_t new_u = uConversion_neon(u_wide_s16, v_wide_s16, coeffs);
149       const int16x8_t new_v = vConversion_neon(u_wide_s16, v_wide_s16, coeffs);
150 
151       // Narrow from 16-bit to 8-bit with saturation.
152       const uint8x16_t y0_output = vcombine_u8(vqmovun_s16(y0_lo), vqmovun_s16(y0_hi));
153       const uint8x16_t y1_output = vcombine_u8(vqmovun_s16(y1_lo), vqmovun_s16(y1_hi));
154       const uint8x8_t u_output = vqmovun_s16(vaddq_s16(new_u, vdupq_n_s16(128)));
155       const uint8x8_t v_output = vqmovun_s16(vaddq_s16(new_v, vdupq_n_s16(128)));
156 
157       vst1q_u8(y0_ptr + w * 2, y0_output);
158       vst1q_u8(y1_ptr + w * 2, y1_output);
159       vst1_u8(u_ptr + w, u_output);
160       vst1_u8(v_ptr + w, v_output);
161 
162       w += 8;
163     } while (w < image->width / 2);
164     y0_ptr += image->luma_stride * 2;
165     y1_ptr += image->luma_stride * 2;
166     u_ptr += image->chroma_stride;
167     v_ptr += image->chroma_stride;
168   } while (++h < image->height / 2);
169 }
170 
convertYuv_neon(jr_uncompressed_ptr image,ultrahdr_color_gamut src_encoding,ultrahdr_color_gamut dst_encoding)171 status_t convertYuv_neon(jr_uncompressed_ptr image, ultrahdr_color_gamut src_encoding,
172                          ultrahdr_color_gamut dst_encoding) {
173   if (image == nullptr) {
174     return ERROR_JPEGR_BAD_PTR;
175   }
176   if (src_encoding == ULTRAHDR_COLORGAMUT_UNSPECIFIED ||
177       dst_encoding == ULTRAHDR_COLORGAMUT_UNSPECIFIED) {
178     return ERROR_JPEGR_INVALID_COLORGAMUT;
179   }
180 
181   const int16_t* coeffs = nullptr;
182   switch (src_encoding) {
183     case ULTRAHDR_COLORGAMUT_BT709:
184       switch (dst_encoding) {
185         case ULTRAHDR_COLORGAMUT_BT709:
186           return JPEGR_NO_ERROR;
187         case ULTRAHDR_COLORGAMUT_P3:
188           coeffs = kYuv709To601_coeffs_neon;
189           break;
190         case ULTRAHDR_COLORGAMUT_BT2100:
191           coeffs = kYuv709To2100_coeffs_neon;
192           break;
193         default:
194           // Should be impossible to hit after input validation
195           return ERROR_JPEGR_INVALID_COLORGAMUT;
196       }
197       break;
198     case ULTRAHDR_COLORGAMUT_P3:
199       switch (dst_encoding) {
200         case ULTRAHDR_COLORGAMUT_BT709:
201           coeffs = kYuv601To709_coeffs_neon;
202           break;
203         case ULTRAHDR_COLORGAMUT_P3:
204           return JPEGR_NO_ERROR;
205         case ULTRAHDR_COLORGAMUT_BT2100:
206           coeffs = kYuv601To2100_coeffs_neon;
207           break;
208         default:
209           // Should be impossible to hit after input validation
210           return ERROR_JPEGR_INVALID_COLORGAMUT;
211       }
212       break;
213     case ULTRAHDR_COLORGAMUT_BT2100:
214       switch (dst_encoding) {
215         case ULTRAHDR_COLORGAMUT_BT709:
216           coeffs = kYuv2100To709_coeffs_neon;
217           break;
218         case ULTRAHDR_COLORGAMUT_P3:
219           coeffs = kYuv2100To601_coeffs_neon;
220           break;
221         case ULTRAHDR_COLORGAMUT_BT2100:
222           return JPEGR_NO_ERROR;
223         default:
224           // Should be impossible to hit after input validation
225           return ERROR_JPEGR_INVALID_COLORGAMUT;
226       }
227       break;
228     default:
229       // Should be impossible to hit after input validation
230       return ERROR_JPEGR_INVALID_COLORGAMUT;
231   }
232 
233   if (coeffs == nullptr) {
234     // Should be impossible to hit after input validation
235     return ERROR_JPEGR_INVALID_COLORGAMUT;
236   }
237 
238   transformYuv420_neon(image, coeffs);
239   return JPEGR_NO_ERROR;
240 }
241 
242 }  // namespace ultrahdr
243