/*
 * jcsample-neon.c - downsampling (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"

#include <arm_neon.h>


ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
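
/* Each 16-byte row of jsimd_h2_downsample_consts above is a table-lookup
 * index pattern.  Row "Pad N" is selected when the last DCT block of a row is
 * short by N input pixels (the functions below compute
 * mask_offset = 16 * N): its indices pass the valid pixels through unchanged
 * and replicate the last valid pixel into the missing lanes, so the final
 * output block is always fully populated.
 */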


/* Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 */
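
/* A rough scalar sketch of the arithmetic performed per output sample (the
 * Neon code below produces eight output samples per iteration):
 *
 *   outptr[i] = (inptr[2 * i] + inptr[2 * i + 1] + (i & 1)) >> 1
 *
 * The alternating 0/1 bias term avoids rounding every pair average in the
 * same direction.
 */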

void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  JSAMPROW inptr, outptr;
  /* Load expansion mask to pad remaining elements of last DCT block. */
  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t expand_mask =
    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  /* Load bias pattern (alternating every pixel.) */
  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
  unsigned i, outrow;

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr = input_data[outrow];

    /* Downsample all but the last DCT block of pixels. */
    for (i = 0; i < width_in_blocks - 1; i++) {
      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
      /* Divide total by 2 and narrow to 8-bit. */
      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
      /* Store samples to memory. */
      vst1_u8(outptr + i * DCTSIZE, samples_u8);
    }

    /* Load pixels in last DCT block into a table. */
    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
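    /* AArch64 provides vqtbl1q_u8(), which looks up into a full 128-bit
     * table.  AArch32 Neon table lookups operate on 64-bit table registers,
     * so the same permutation is done there with vtbl2_u8() on the two
     * halves of the vector.
     */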
#if defined(__aarch64__) || defined(_M_ARM64)
    /* Pad the empty elements with the value of the last pixel. */
    pixels = vqtbl1q_u8(pixels, expand_mask);
#else
    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
                         vtbl2_u8(table, vget_high_u8(expand_mask)));
#endif
    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
    /* Divide total by 2, narrow to 8-bit, and store. */
    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
  }
}


/* Downsample pixel values of a single component.
 * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 */
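
/* A rough scalar sketch of the arithmetic performed per output sample (the
 * Neon code below produces eight output samples per iteration):
 *
 *   outptr[i] = (inptr0[2 * i] + inptr0[2 * i + 1] +
 *                inptr1[2 * i] + inptr1[2 * i + 1] + 1 + (i & 1)) >> 2
 *
 * The alternating 1/2 bias term avoids rounding every four-pixel average in
 * the same direction.
 */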

void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  JSAMPROW inptr0, inptr1, outptr;
  /* Load expansion mask to pad remaining elements of last DCT block. */
  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t expand_mask =
    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  /* Load bias pattern (alternating every pixel.) */
  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
  unsigned i, outrow;

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr0 = input_data[outrow];
    inptr1 = input_data[outrow + 1];

    /* Downsample all but the last DCT block of pixels. */
    for (i = 0; i < width_in_blocks - 1; i++) {
      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
       */
      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
      /* Divide total by 4 and narrow to 8-bit. */
      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
      /* Store samples to memory. */
      vst1_u8(outptr + i * DCTSIZE, samples_u8);
    }

    /* Load pixels in last DCT block into a table. */
    uint8x16_t pixels_r0 =
      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
    uint8x16_t pixels_r1 =
      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
    /* Pad the empty elements with the value of the last pixel. */
    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
#else
    uint8x8x2_t table_r0 =
      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
    uint8x8x2_t table_r1 =
      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
#endif
    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
    /* Divide total by 4, narrow to 8-bit, and store. */
    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
  }
}