• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1Index: jdmarker.c
2===================================================================
3--- jdmarker.c	(revision 829)
4+++ jdmarker.c	(working copy)
5@@ -910,7 +910,7 @@
6   }
7
8   if (cinfo->marker->discarded_bytes != 0) {
9-    WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
10+    TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
11     cinfo->marker->discarded_bytes = 0;
12   }
13
14@@ -944,7 +944,144 @@
15   return TRUE;
16 }
17
18+#ifdef MOTION_JPEG_SUPPORTED
19
20+/* The default Huffman tables used by motion JPEG frames. When a motion JPEG
21+ * frame does not have DHT tables, we should use the Huffman tables suggested by
22+ * the JPEG standard. Each of these tables represents a member of the JHUFF_TBL
23+ * struct so we can just copy it to the corresponding JHUFF_TBL member.
24+ */
25+/* DC table 0 */
26+LOCAL(const unsigned char) mjpg_dc0_bits[] = {
27+  0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01,
28+  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
29+};
30+
31+LOCAL(const unsigned char) mjpg_dc0_huffval[] = {
32+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
33+  0x08, 0x09, 0x0A, 0x0B
34+};
35+
36+/* DC table 1 */
37+LOCAL(const unsigned char) mjpg_dc1_bits[] = {
38+  0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
39+  0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00
40+};
41+
42+LOCAL(const unsigned char) mjpg_dc1_huffval[] = {
43+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
44+  0x08, 0x09, 0x0A, 0x0B
45+};
46+
47+/* AC table 0 */
48+LOCAL(const unsigned char) mjpg_ac0_bits[] = {
49+  0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03,
50+  0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7D
51+};
52+
53+LOCAL(const unsigned char) mjpg_ac0_huffval[] = {
54+  0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
55+  0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
56+  0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08,
57+  0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0,
58+  0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
59+  0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28,
60+  0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
61+  0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
62+  0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
63+  0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
64+  0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
65+  0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
66+  0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
67+  0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
68+  0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
69+  0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5,
70+  0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4,
71+  0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2,
72+  0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
73+  0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
74+  0xF9, 0xFA
75+};
76+
77+/* AC table 1 */
78+LOCAL(const unsigned char) mjpg_ac1_bits[] = {
79+  0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04,
80+  0x07, 0x05, 0x04, 0x04, 0x00, 0x01, 0x02, 0x77
81+};
82+
83+LOCAL(const unsigned char) mjpg_ac1_huffval[] = {
84+  0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
85+  0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
86+  0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
87+  0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0,
88+  0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34,
89+  0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26,
90+  0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38,
91+  0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
92+  0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
93+  0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
94+  0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
95+  0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
96+  0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96,
97+  0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
98+  0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4,
99+  0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3,
100+  0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2,
101+  0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
102+  0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
103+  0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
104+  0xF9, 0xFA
105+};
106+
107+/* Loads the default Huffman tables used by motion JPEG frames. For each DC/AC
108+ * table slot (0 and 1) that is still NULL, this allocates a table and copies in
109+ * the table suggested by the JPEG standard; slots already set are left alone.
110+ */
111+LOCAL(void)
112+mjpg_load_huff_tables (j_decompress_ptr cinfo)
113+{
114+  JHUFF_TBL *htblptr;
115+
116+  if (! cinfo->dc_huff_tbl_ptrs[0]) {  /* DC slot 0 has no table yet */
117+    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
118+    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));  /* bits[0] and unused huffval stay 0 */
119+    MEMCOPY(&htblptr->bits[1], mjpg_dc0_bits, SIZEOF(mjpg_dc0_bits));  /* 16 entries -> bits[1..16] */
120+    MEMCOPY(&htblptr->huffval[0], mjpg_dc0_huffval, SIZEOF(mjpg_dc0_huffval));
121+    cinfo->dc_huff_tbl_ptrs[0] = htblptr;
122+  }
123+
124+  if (! cinfo->dc_huff_tbl_ptrs[1]) {  /* DC slot 1 has no table yet */
125+    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
126+    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
127+    MEMCOPY(&htblptr->bits[1], mjpg_dc1_bits, SIZEOF(mjpg_dc1_bits));
128+    MEMCOPY(&htblptr->huffval[0], mjpg_dc1_huffval, SIZEOF(mjpg_dc1_huffval));
129+    cinfo->dc_huff_tbl_ptrs[1] = htblptr;
130+  }
131+
132+  if (! cinfo->ac_huff_tbl_ptrs[0]) {  /* AC slot 0 has no table yet */
133+    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
134+    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
135+    MEMCOPY(&htblptr->bits[1], mjpg_ac0_bits, SIZEOF(mjpg_ac0_bits));
136+    MEMCOPY(&htblptr->huffval[0], mjpg_ac0_huffval, SIZEOF(mjpg_ac0_huffval));
137+    cinfo->ac_huff_tbl_ptrs[0] = htblptr;
138+  }
139+
140+  if (! cinfo->ac_huff_tbl_ptrs[1]) {  /* AC slot 1 has no table yet */
141+    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
142+    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
143+    MEMCOPY(&htblptr->bits[1], mjpg_ac1_bits, SIZEOF(mjpg_ac1_bits));
144+    MEMCOPY(&htblptr->huffval[0], mjpg_ac1_huffval, SIZEOF(mjpg_ac1_huffval));
145+    cinfo->ac_huff_tbl_ptrs[1] = htblptr;
146+  }
147+}
148+
149+#else
150+
151+#define mjpg_load_huff_tables(cinfo)
152+
153+#endif /* MOTION_JPEG_SUPPORTED */
154+
155+
156 /*
157  * Read markers until SOS or EOI.
158  *
159@@ -1013,6 +1150,7 @@
160       break;
161
162     case M_SOS:
163+      mjpg_load_huff_tables(cinfo);
164       if (! get_sos(cinfo))
165 	return JPEG_SUSPENDED;
166       cinfo->unread_marker = 0;	/* processed the marker */
167Index: jmorecfg.h
168===================================================================
169--- jmorecfg.h	(revision 829)
170+++ jmorecfg.h	(working copy)
171@@ -153,14 +153,18 @@
172 /* INT16 must hold at least the values -32768..32767. */
173
174 #ifndef XMD_H			/* X11/xmd.h correctly defines INT16 */
175+#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT16 */
176 typedef short INT16;
177 #endif
178+#endif
179
180 /* INT32 must hold at least signed 32-bit values. */
181
182 #ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
183+#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT32 */
184 typedef long INT32;
185 #endif
186+#endif
187
188 /* Datatype used for image dimensions.  The JPEG standard only supports
189  * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
190@@ -210,11 +214,13 @@
191  * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
192  */
193
194+#ifndef FAR
195 #ifdef NEED_FAR_POINTERS
196 #define FAR  far
197 #else
198 #define FAR
199 #endif
200+#endif
201
202
203 /*
204Index: jpeglib.h
205===================================================================
206--- jpeglib.h	(revision 829)
207+++ jpeglib.h	(working copy)
208@@ -15,6 +15,10 @@
209 #ifndef JPEGLIB_H
210 #define JPEGLIB_H
211
212+/* Begin chromium edits */
213+#include "jpeglibmangler.h"
214+/* End chromium edits */
215+
216 /*
217  * First we include the configuration files that record how this
218  * installation of the JPEG library is set up.  jconfig.h can be
219Index: jpeglibmangler.h
220===================================================================
221--- jpeglibmangler.h	(revision 0)
222+++ jpeglibmangler.h	(revision 0)
223@@ -0,0 +1,113 @@
224+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
225+// Use of this source code is governed by a BSD-style license that can be
226+// found in the LICENSE file.
227+
228+#ifndef THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
229+#define THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
230+
231+// Mangle all externally visible function names so we can build our own libjpeg
232+// without system libraries trying to use it.
233+
234+#define jpeg_make_c_derived_tbl chromium_jpeg_make_c_derived_tbl
235+#define jpeg_gen_optimal_table chromium_jpeg_gen_optimal_table
236+#define jpeg_make_d_derived_tbl chromium_jpeg_make_d_derived_tbl
237+#define jpeg_fill_bit_buffer chromium_jpeg_fill_bit_buffer
238+#define jpeg_huff_decode chromium_jpeg_huff_decode
239+#define jpeg_fdct_islow chromium_jpeg_fdct_islow
240+#define jpeg_fdct_ifast chromium_jpeg_fdct_ifast
241+#define jpeg_fdct_float chromium_jpeg_fdct_float
242+#define jpeg_idct_islow chromium_jpeg_idct_islow
243+#define jpeg_idct_ifast chromium_jpeg_idct_ifast
244+#define jpeg_idct_float chromium_jpeg_idct_float
245+#define jpeg_idct_4x4 chromium_jpeg_idct_4x4
246+#define jpeg_idct_2x2 chromium_jpeg_idct_2x2
247+#define jpeg_idct_1x1 chromium_jpeg_idct_1x1
248+#define jinit_compress_master chromium_jinit_compress_master
249+#define jinit_c_master_control chromium_jinit_c_master_control
250+#define jinit_c_main_controller chromium_jinit_c_main_controller
251+#define jinit_c_prep_controller chromium_jinit_c_prep_controller
252+#define jinit_c_coef_controller chromium_jinit_c_coef_controller
253+#define jinit_color_converter chromium_jinit_color_converter
254+#define jinit_downsampler chromium_jinit_downsampler
255+#define jinit_forward_dct chromium_jinit_forward_dct
256+#define jinit_huff_encoder chromium_jinit_huff_encoder
257+#define jinit_phuff_encoder chromium_jinit_phuff_encoder
258+#define jinit_marker_writer chromium_jinit_marker_writer
259+#define jinit_master_decompress chromium_jinit_master_decompress
260+#define jinit_d_main_controller chromium_jinit_d_main_controller
261+#define jinit_d_coef_controller chromium_jinit_d_coef_controller
262+#define jinit_d_post_controller chromium_jinit_d_post_controller
263+#define jinit_input_controller chromium_jinit_input_controller
264+#define jinit_marker_reader chromium_jinit_marker_reader
265+#define jinit_huff_decoder chromium_jinit_huff_decoder
266+#define jinit_phuff_decoder chromium_jinit_phuff_decoder
267+#define jinit_inverse_dct chromium_jinit_inverse_dct
268+#define jinit_upsampler chromium_jinit_upsampler
269+#define jinit_color_deconverter chromium_jinit_color_deconverter
270+#define jinit_1pass_quantizer chromium_jinit_1pass_quantizer
271+#define jinit_2pass_quantizer chromium_jinit_2pass_quantizer
272+#define jinit_merged_upsampler chromium_jinit_merged_upsampler
273+#define jinit_memory_mgr chromium_jinit_memory_mgr
274+#define jdiv_round_up chromium_jdiv_round_up
275+#define jround_up chromium_jround_up
276+#define jcopy_sample_rows chromium_jcopy_sample_rows
277+#define jcopy_block_row chromium_jcopy_block_row
278+#define jzero_far chromium_jzero_far
279+#define jpeg_std_error chromium_jpeg_std_error
280+#define jpeg_CreateCompress chromium_jpeg_CreateCompress
281+#define jpeg_CreateDecompress chromium_jpeg_CreateDecompress
282+#define jpeg_destroy_compress chromium_jpeg_destroy_compress
283+#define jpeg_destroy_decompress chromium_jpeg_destroy_decompress
284+#define jpeg_stdio_dest chromium_jpeg_stdio_dest
285+#define jpeg_stdio_src chromium_jpeg_stdio_src
286+#define jpeg_set_defaults chromium_jpeg_set_defaults
287+#define jpeg_set_colorspace chromium_jpeg_set_colorspace
288+#define jpeg_default_colorspace chromium_jpeg_default_colorspace
289+#define jpeg_set_quality chromium_jpeg_set_quality
290+#define jpeg_set_linear_quality chromium_jpeg_set_linear_quality
291+#define jpeg_add_quant_table chromium_jpeg_add_quant_table
292+#define jpeg_quality_scaling chromium_jpeg_quality_scaling
293+#define jpeg_simple_progression chromium_jpeg_simple_progression
294+#define jpeg_suppress_tables chromium_jpeg_suppress_tables
295+#define jpeg_alloc_quant_table chromium_jpeg_alloc_quant_table
296+#define jpeg_alloc_huff_table chromium_jpeg_alloc_huff_table
297+#define jpeg_start_compress chromium_jpeg_start_compress
298+#define jpeg_write_scanlines chromium_jpeg_write_scanlines
299+#define jpeg_finish_compress chromium_jpeg_finish_compress
300+#define jpeg_write_raw_data chromium_jpeg_write_raw_data
301+#define jpeg_write_marker chromium_jpeg_write_marker
302+#define jpeg_write_m_header chromium_jpeg_write_m_header
303+#define jpeg_write_m_byte chromium_jpeg_write_m_byte
304+#define jpeg_write_tables chromium_jpeg_write_tables
305+#define jpeg_read_header chromium_jpeg_read_header
306+#define jpeg_start_decompress chromium_jpeg_start_decompress
307+#define jpeg_read_scanlines chromium_jpeg_read_scanlines
308+#define jpeg_finish_decompress chromium_jpeg_finish_decompress
309+#define jpeg_read_raw_data chromium_jpeg_read_raw_data
310+#define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans
311+#define jpeg_start_output chromium_jpeg_start_output
312+#define jpeg_finish_output chromium_jpeg_finish_output
313+#define jpeg_input_complete chromium_jpeg_input_complete
314+#define jpeg_new_colormap chromium_jpeg_new_colormap
315+#define jpeg_consume_input chromium_jpeg_consume_input
316+#define jpeg_calc_output_dimensions chromium_jpeg_calc_output_dimensions
317+#define jpeg_save_markers chromium_jpeg_save_markers
318+#define jpeg_set_marker_processor chromium_jpeg_set_marker_processor
319+#define jpeg_read_coefficients chromium_jpeg_read_coefficients
320+#define jpeg_write_coefficients chromium_jpeg_write_coefficients
321+#define jpeg_copy_critical_parameters chromium_jpeg_copy_critical_parameters
322+#define jpeg_abort_compress chromium_jpeg_abort_compress
323+#define jpeg_abort_decompress chromium_jpeg_abort_decompress
324+#define jpeg_abort chromium_jpeg_abort
325+#define jpeg_destroy chromium_jpeg_destroy
326+#define jpeg_resync_to_restart chromium_jpeg_resync_to_restart
327+#define jpeg_get_small chromium_jpeg_get_small
328+#define jpeg_free_small chromium_jpeg_free_small
329+#define jpeg_get_large chromium_jpeg_get_large
330+#define jpeg_free_large chromium_jpeg_free_large
331+#define jpeg_mem_available chromium_jpeg_mem_available
332+#define jpeg_open_backing_store chromium_jpeg_open_backing_store
333+#define jpeg_mem_init chromium_jpeg_mem_init
334+#define jpeg_mem_term chromium_jpeg_mem_term
335+
336+#endif  // THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
337Index: simd/jcgrass2-64.asm
338===================================================================
339--- simd/jcgrass2-64.asm	(revision 829)
340+++ simd/jcgrass2-64.asm	(working copy)
341@@ -30,7 +30,7 @@
342 	SECTION	SEG_CONST
343
344 	alignz	16
345-	global	EXTN(jconst_rgb_gray_convert_sse2)
346+	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
347
348 EXTN(jconst_rgb_gray_convert_sse2):
349
350Index: simd/jiss2fst.asm
351===================================================================
352--- simd/jiss2fst.asm	(revision 829)
353+++ simd/jiss2fst.asm	(working copy)
354@@ -59,7 +59,7 @@
355 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
356
357 	alignz	16
358-	global	EXTN(jconst_idct_ifast_sse2)
359+	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
360
361 EXTN(jconst_idct_ifast_sse2):
362
363@@ -92,7 +92,7 @@
364 %define WK_NUM		2
365
366 	align	16
367-	global	EXTN(jsimd_idct_ifast_sse2)
368+	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
369
370 EXTN(jsimd_idct_ifast_sse2):
371 	push	ebp
372Index: simd/jcclrss2-64.asm
373===================================================================
374--- simd/jcclrss2-64.asm	(revision 829)
375+++ simd/jcclrss2-64.asm	(working copy)
376@@ -37,7 +37,7 @@
377
378 	align	16
379
380-	global	EXTN(jsimd_rgb_ycc_convert_sse2)
381+	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
382
383 EXTN(jsimd_rgb_ycc_convert_sse2):
384 	push	rbp
385Index: simd/jiss2red-64.asm
386===================================================================
387--- simd/jiss2red-64.asm	(revision 829)
388+++ simd/jiss2red-64.asm	(working copy)
389@@ -73,7 +73,7 @@
390 	SECTION	SEG_CONST
391
392 	alignz	16
393-	global	EXTN(jconst_idct_red_sse2)
394+	global	EXTN(jconst_idct_red_sse2) PRIVATE
395
396 EXTN(jconst_idct_red_sse2):
397
398@@ -114,7 +114,7 @@
399 %define WK_NUM		2
400
401 	align	16
402-	global	EXTN(jsimd_idct_4x4_sse2)
403+	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
404
405 EXTN(jsimd_idct_4x4_sse2):
406 	push	rbp
407@@ -413,7 +413,7 @@
408 ; r13 = JDIMENSION output_col
409
410 	align	16
411-	global	EXTN(jsimd_idct_2x2_sse2)
412+	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
413
414 EXTN(jsimd_idct_2x2_sse2):
415 	push	rbp
416Index: simd/ji3dnflt.asm
417===================================================================
418--- simd/ji3dnflt.asm	(revision 829)
419+++ simd/ji3dnflt.asm	(working copy)
420@@ -27,7 +27,7 @@
421 	SECTION	SEG_CONST
422
423 	alignz	16
424-	global	EXTN(jconst_idct_float_3dnow)
425+	global	EXTN(jconst_idct_float_3dnow) PRIVATE
426
427 EXTN(jconst_idct_float_3dnow):
428
429@@ -63,7 +63,7 @@
430 					; FAST_FLOAT workspace[DCTSIZE2]
431
432 	align	16
433-	global	EXTN(jsimd_idct_float_3dnow)
434+	global	EXTN(jsimd_idct_float_3dnow) PRIVATE
435
436 EXTN(jsimd_idct_float_3dnow):
437 	push	ebp
438Index: simd/jsimdcpu.asm
439===================================================================
440--- simd/jsimdcpu.asm	(revision 829)
441+++ simd/jsimdcpu.asm	(working copy)
442@@ -29,7 +29,7 @@
443 ;
444
445 	align	16
446-	global	EXTN(jpeg_simd_cpu_support)
447+	global	EXTN(jpeg_simd_cpu_support) PRIVATE
448
449 EXTN(jpeg_simd_cpu_support):
450 	push	ebx
451Index: simd/jdmerss2-64.asm
452===================================================================
453--- simd/jdmerss2-64.asm	(revision 829)
454+++ simd/jdmerss2-64.asm	(working copy)
455@@ -35,7 +35,7 @@
456 	SECTION	SEG_CONST
457
458 	alignz	16
459-	global	EXTN(jconst_merged_upsample_sse2)
460+	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
461
462 EXTN(jconst_merged_upsample_sse2):
463
464Index: simd/jdsammmx.asm
465===================================================================
466--- simd/jdsammmx.asm	(revision 829)
467+++ simd/jdsammmx.asm	(working copy)
468@@ -22,7 +22,7 @@
469 	SECTION	SEG_CONST
470
471 	alignz	16
472-	global	EXTN(jconst_fancy_upsample_mmx)
473+	global	EXTN(jconst_fancy_upsample_mmx) PRIVATE
474
475 EXTN(jconst_fancy_upsample_mmx):
476
477@@ -58,7 +58,7 @@
478 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
479
480 	align	16
481-	global	EXTN(jsimd_h2v1_fancy_upsample_mmx)
482+	global	EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
483
484 EXTN(jsimd_h2v1_fancy_upsample_mmx):
485 	push	ebp
486@@ -216,7 +216,7 @@
487 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
488
489 	align	16
490-	global	EXTN(jsimd_h2v2_fancy_upsample_mmx)
491+	global	EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
492
493 EXTN(jsimd_h2v2_fancy_upsample_mmx):
494 	push	ebp
495@@ -542,7 +542,7 @@
496 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
497
498 	align	16
499-	global	EXTN(jsimd_h2v1_upsample_mmx)
500+	global	EXTN(jsimd_h2v1_upsample_mmx) PRIVATE
501
502 EXTN(jsimd_h2v1_upsample_mmx):
503 	push	ebp
504@@ -643,7 +643,7 @@
505 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
506
507 	align	16
508-	global	EXTN(jsimd_h2v2_upsample_mmx)
509+	global	EXTN(jsimd_h2v2_upsample_mmx) PRIVATE
510
511 EXTN(jsimd_h2v2_upsample_mmx):
512 	push	ebp
513Index: simd/jdmrgmmx.asm
514===================================================================
515--- simd/jdmrgmmx.asm	(revision 829)
516+++ simd/jdmrgmmx.asm	(working copy)
517@@ -40,7 +40,7 @@
518 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
519
520 	align	16
521-	global	EXTN(jsimd_h2v1_merged_upsample_mmx)
522+	global	EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE
523
524 EXTN(jsimd_h2v1_merged_upsample_mmx):
525 	push	ebp
526@@ -409,7 +409,7 @@
527 %define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
528
529 	align	16
530-	global	EXTN(jsimd_h2v2_merged_upsample_mmx)
531+	global	EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE
532
533 EXTN(jsimd_h2v2_merged_upsample_mmx):
534 	push	ebp
535Index: simd/jdsamss2.asm
536===================================================================
537--- simd/jdsamss2.asm	(revision 829)
538+++ simd/jdsamss2.asm	(working copy)
539@@ -22,7 +22,7 @@
540 	SECTION	SEG_CONST
541
542 	alignz	16
543-	global	EXTN(jconst_fancy_upsample_sse2)
544+	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
545
546 EXTN(jconst_fancy_upsample_sse2):
547
548@@ -58,7 +58,7 @@
549 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
550
551 	align	16
552-	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
553+	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
554
555 EXTN(jsimd_h2v1_fancy_upsample_sse2):
556 	push	ebp
557@@ -214,7 +214,7 @@
558 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
559
560 	align	16
561-	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
562+	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
563
564 EXTN(jsimd_h2v2_fancy_upsample_sse2):
565 	push	ebp
566@@ -538,7 +538,7 @@
567 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
568
569 	align	16
570-	global	EXTN(jsimd_h2v1_upsample_sse2)
571+	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
572
573 EXTN(jsimd_h2v1_upsample_sse2):
574 	push	ebp
575@@ -637,7 +637,7 @@
576 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
577
578 	align	16
579-	global	EXTN(jsimd_h2v2_upsample_sse2)
580+	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
581
582 EXTN(jsimd_h2v2_upsample_sse2):
583 	push	ebp
584Index: simd/jiss2flt-64.asm
585===================================================================
586--- simd/jiss2flt-64.asm	(revision 829)
587+++ simd/jiss2flt-64.asm	(working copy)
588@@ -38,7 +38,7 @@
589 	SECTION	SEG_CONST
590
591 	alignz	16
592-	global	EXTN(jconst_idct_float_sse2)
593+	global	EXTN(jconst_idct_float_sse2) PRIVATE
594
595 EXTN(jconst_idct_float_sse2):
596
597@@ -74,7 +74,7 @@
598 					; FAST_FLOAT workspace[DCTSIZE2]
599
600 	align	16
601-	global	EXTN(jsimd_idct_float_sse2)
602+	global	EXTN(jsimd_idct_float_sse2) PRIVATE
603
604 EXTN(jsimd_idct_float_sse2):
605 	push	rbp
606Index: simd/jfss2int-64.asm
607===================================================================
608--- simd/jfss2int-64.asm	(revision 829)
609+++ simd/jfss2int-64.asm	(working copy)
610@@ -67,7 +67,7 @@
611 	SECTION	SEG_CONST
612
613 	alignz	16
614-	global	EXTN(jconst_fdct_islow_sse2)
615+	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
616
617 EXTN(jconst_fdct_islow_sse2):
618
619@@ -101,7 +101,7 @@
620 %define WK_NUM		6
621
622 	align	16
623-	global	EXTN(jsimd_fdct_islow_sse2)
624+	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
625
626 EXTN(jsimd_fdct_islow_sse2):
627 	push	rbp
628Index: simd/jcqnts2f.asm
629===================================================================
630--- simd/jcqnts2f.asm	(revision 829)
631+++ simd/jcqnts2f.asm	(working copy)
632@@ -35,7 +35,7 @@
633 %define workspace	ebp+16		; FAST_FLOAT * workspace
634
635 	align	16
636-	global	EXTN(jsimd_convsamp_float_sse2)
637+	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
638
639 EXTN(jsimd_convsamp_float_sse2):
640 	push	ebp
641@@ -115,7 +115,7 @@
642 %define workspace	ebp+16		; FAST_FLOAT * workspace
643
644 	align	16
645-	global	EXTN(jsimd_quantize_float_sse2)
646+	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
647
648 EXTN(jsimd_quantize_float_sse2):
649 	push	ebp
650Index: simd/jdmrgss2.asm
651===================================================================
652--- simd/jdmrgss2.asm	(revision 829)
653+++ simd/jdmrgss2.asm	(working copy)
654@@ -40,7 +40,7 @@
655 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
656
657 	align	16
658-	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
659+	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
660
661 EXTN(jsimd_h2v1_merged_upsample_sse2):
662 	push	ebp
663@@ -560,7 +560,7 @@
664 %define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
665
666 	align	16
667-	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
668+	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
669
670 EXTN(jsimd_h2v2_merged_upsample_sse2):
671 	push	ebp
672Index: simd/jfmmxint.asm
673===================================================================
674--- simd/jfmmxint.asm	(revision 829)
675+++ simd/jfmmxint.asm	(working copy)
676@@ -66,7 +66,7 @@
677 	SECTION	SEG_CONST
678
679 	alignz	16
680-	global	EXTN(jconst_fdct_islow_mmx)
681+	global	EXTN(jconst_fdct_islow_mmx) PRIVATE
682
683 EXTN(jconst_fdct_islow_mmx):
684
685@@ -101,7 +101,7 @@
686 %define WK_NUM		2
687
688 	align	16
689-	global	EXTN(jsimd_fdct_islow_mmx)
690+	global	EXTN(jsimd_fdct_islow_mmx) PRIVATE
691
692 EXTN(jsimd_fdct_islow_mmx):
693 	push	ebp
694Index: simd/jcgryss2-64.asm
695===================================================================
696--- simd/jcgryss2-64.asm	(revision 829)
697+++ simd/jcgryss2-64.asm	(working copy)
698@@ -37,7 +37,7 @@
699
700 	align	16
701
702-	global	EXTN(jsimd_rgb_gray_convert_sse2)
703+	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
704
705 EXTN(jsimd_rgb_gray_convert_sse2):
706 	push	rbp
707Index: simd/jcqnts2i.asm
708===================================================================
709--- simd/jcqnts2i.asm	(revision 829)
710+++ simd/jcqnts2i.asm	(working copy)
711@@ -35,7 +35,7 @@
712 %define workspace	ebp+16		; DCTELEM * workspace
713
714 	align	16
715-	global	EXTN(jsimd_convsamp_sse2)
716+	global	EXTN(jsimd_convsamp_sse2) PRIVATE
717
718 EXTN(jsimd_convsamp_sse2):
719 	push	ebp
720@@ -117,7 +117,7 @@
721 %define workspace	ebp+16		; DCTELEM * workspace
722
723 	align	16
724-	global	EXTN(jsimd_quantize_sse2)
725+	global	EXTN(jsimd_quantize_sse2) PRIVATE
726
727 EXTN(jsimd_quantize_sse2):
728 	push	ebp
729Index: simd/jiss2fst-64.asm
730===================================================================
731--- simd/jiss2fst-64.asm	(revision 829)
732+++ simd/jiss2fst-64.asm	(working copy)
733@@ -60,7 +60,7 @@
734 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
735
736 	alignz	16
737-	global	EXTN(jconst_idct_ifast_sse2)
738+	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
739
740 EXTN(jconst_idct_ifast_sse2):
741
742@@ -93,7 +93,7 @@
743 %define WK_NUM		2
744
745 	align	16
746-	global	EXTN(jsimd_idct_ifast_sse2)
747+	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
748
749 EXTN(jsimd_idct_ifast_sse2):
750 	push	rbp
751Index: simd/jiss2flt.asm
752===================================================================
753--- simd/jiss2flt.asm	(revision 829)
754+++ simd/jiss2flt.asm	(working copy)
755@@ -37,7 +37,7 @@
756 	SECTION	SEG_CONST
757
758 	alignz	16
759-	global	EXTN(jconst_idct_float_sse2)
760+	global	EXTN(jconst_idct_float_sse2) PRIVATE
761
762 EXTN(jconst_idct_float_sse2):
763
764@@ -73,7 +73,7 @@
765 					; FAST_FLOAT workspace[DCTSIZE2]
766
767 	align	16
768-	global	EXTN(jsimd_idct_float_sse2)
769+	global	EXTN(jsimd_idct_float_sse2) PRIVATE
770
771 EXTN(jsimd_idct_float_sse2):
772 	push	ebp
773Index: simd/jiss2int.asm
774===================================================================
775--- simd/jiss2int.asm	(revision 829)
776+++ simd/jiss2int.asm	(working copy)
777@@ -66,7 +66,7 @@
778 	SECTION	SEG_CONST
779
780 	alignz	16
781-	global	EXTN(jconst_idct_islow_sse2)
782+	global	EXTN(jconst_idct_islow_sse2) PRIVATE
783
784 EXTN(jconst_idct_islow_sse2):
785
786@@ -105,7 +105,7 @@
787 %define WK_NUM		12
788
789 	align	16
790-	global	EXTN(jsimd_idct_islow_sse2)
791+	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
792
793 EXTN(jsimd_idct_islow_sse2):
794 	push	ebp
795Index: simd/jfsseflt-64.asm
796===================================================================
797--- simd/jfsseflt-64.asm	(revision 829)
798+++ simd/jfsseflt-64.asm	(working copy)
799@@ -38,7 +38,7 @@
800 	SECTION	SEG_CONST
801
802 	alignz	16
803-	global	EXTN(jconst_fdct_float_sse)
804+	global	EXTN(jconst_fdct_float_sse) PRIVATE
805
806 EXTN(jconst_fdct_float_sse):
807
808@@ -65,7 +65,7 @@
809 %define WK_NUM		2
810
811 	align	16
812-	global	EXTN(jsimd_fdct_float_sse)
813+	global	EXTN(jsimd_fdct_float_sse) PRIVATE
814
815 EXTN(jsimd_fdct_float_sse):
816 	push	rbp
817Index: simd/jccolss2-64.asm
818===================================================================
819--- simd/jccolss2-64.asm	(revision 829)
820+++ simd/jccolss2-64.asm	(working copy)
821@@ -34,7 +34,7 @@
822 	SECTION	SEG_CONST
823
824 	alignz	16
825-	global	EXTN(jconst_rgb_ycc_convert_sse2)
826+	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
827
828 EXTN(jconst_rgb_ycc_convert_sse2):
829
830Index: simd/jcsamss2-64.asm
831===================================================================
832--- simd/jcsamss2-64.asm	(revision 829)
833+++ simd/jcsamss2-64.asm	(working copy)
834@@ -41,7 +41,7 @@
835 ; r15 = JSAMPARRAY output_data
836
837 	align	16
838-	global	EXTN(jsimd_h2v1_downsample_sse2)
839+	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
840
841 EXTN(jsimd_h2v1_downsample_sse2):
842 	push	rbp
843@@ -185,7 +185,7 @@
844 ; r15 = JSAMPARRAY output_data
845
846 	align	16
847-	global	EXTN(jsimd_h2v2_downsample_sse2)
848+	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
849
850 EXTN(jsimd_h2v2_downsample_sse2):
851 	push	rbp
852Index: simd/jdclrss2-64.asm
853===================================================================
854--- simd/jdclrss2-64.asm	(revision 829)
855+++ simd/jdclrss2-64.asm	(working copy)
856@@ -39,7 +39,7 @@
857 %define WK_NUM		2
858
859 	align	16
860-	global	EXTN(jsimd_ycc_rgb_convert_sse2)
861+	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
862
863 EXTN(jsimd_ycc_rgb_convert_sse2):
864 	push	rbp
865Index: simd/jdcolmmx.asm
866===================================================================
867--- simd/jdcolmmx.asm	(revision 829)
868+++ simd/jdcolmmx.asm	(working copy)
869@@ -35,7 +35,7 @@
870 	SECTION	SEG_CONST
871
872 	alignz	16
873-	global	EXTN(jconst_ycc_rgb_convert_mmx)
874+	global	EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE
875
876 EXTN(jconst_ycc_rgb_convert_mmx):
877
878Index: simd/jcclrmmx.asm
879===================================================================
880--- simd/jcclrmmx.asm	(revision 829)
881+++ simd/jcclrmmx.asm	(working copy)
882@@ -40,7 +40,7 @@
883 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
884
885 	align	16
886-	global	EXTN(jsimd_rgb_ycc_convert_mmx)
887+	global	EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE
888
889 EXTN(jsimd_rgb_ycc_convert_mmx):
890 	push	ebp
891Index: simd/jfsseflt.asm
892===================================================================
893--- simd/jfsseflt.asm	(revision 829)
894+++ simd/jfsseflt.asm	(working copy)
895@@ -37,7 +37,7 @@
896 	SECTION	SEG_CONST
897
898 	alignz	16
899-	global	EXTN(jconst_fdct_float_sse)
900+	global	EXTN(jconst_fdct_float_sse) PRIVATE
901
902 EXTN(jconst_fdct_float_sse):
903
904@@ -65,7 +65,7 @@
905 %define WK_NUM		2
906
907 	align	16
908-	global	EXTN(jsimd_fdct_float_sse)
909+	global	EXTN(jsimd_fdct_float_sse) PRIVATE
910
911 EXTN(jsimd_fdct_float_sse):
912 	push	ebp
913Index: simd/jdmrgss2-64.asm
914===================================================================
915--- simd/jdmrgss2-64.asm	(revision 829)
916+++ simd/jdmrgss2-64.asm	(working copy)
917@@ -39,7 +39,7 @@
918 %define WK_NUM		3
919
920 	align	16
921-	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
922+	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
923
924 EXTN(jsimd_h2v1_merged_upsample_sse2):
925 	push	rbp
926@@ -543,7 +543,7 @@
927 ; r13 = JSAMPARRAY output_buf
928
929 	align	16
930-	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
931+	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
932
933 EXTN(jsimd_h2v2_merged_upsample_sse2):
934 	push	rbp
935Index: simd/jdcolss2.asm
936===================================================================
937--- simd/jdcolss2.asm	(revision 829)
938+++ simd/jdcolss2.asm	(working copy)
939@@ -35,7 +35,7 @@
940 	SECTION	SEG_CONST
941
942 	alignz	16
943-	global	EXTN(jconst_ycc_rgb_convert_sse2)
944+	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
945
946 EXTN(jconst_ycc_rgb_convert_sse2):
947
948Index: simd/jdmermmx.asm
949===================================================================
950--- simd/jdmermmx.asm	(revision 829)
951+++ simd/jdmermmx.asm	(working copy)
952@@ -35,7 +35,7 @@
953 	SECTION	SEG_CONST
954
955 	alignz	16
956-	global	EXTN(jconst_merged_upsample_mmx)
957+	global	EXTN(jconst_merged_upsample_mmx) PRIVATE
958
959 EXTN(jconst_merged_upsample_mmx):
960
961Index: simd/jcclrss2.asm
962===================================================================
963--- simd/jcclrss2.asm	(revision 829)
964+++ simd/jcclrss2.asm	(working copy)
965@@ -38,7 +38,7 @@
966
967 	align	16
968
969-	global	EXTN(jsimd_rgb_ycc_convert_sse2)
970+	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
971
972 EXTN(jsimd_rgb_ycc_convert_sse2):
973 	push	ebp
974Index: simd/jiss2red.asm
975===================================================================
976--- simd/jiss2red.asm	(revision 829)
977+++ simd/jiss2red.asm	(working copy)
978@@ -72,7 +72,7 @@
979 	SECTION	SEG_CONST
980
981 	alignz	16
982-	global	EXTN(jconst_idct_red_sse2)
983+	global	EXTN(jconst_idct_red_sse2) PRIVATE
984
985 EXTN(jconst_idct_red_sse2):
986
987@@ -113,7 +113,7 @@
988 %define WK_NUM		2
989
990 	align	16
991-	global	EXTN(jsimd_idct_4x4_sse2)
992+	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
993
994 EXTN(jsimd_idct_4x4_sse2):
995 	push	ebp
996@@ -424,7 +424,7 @@
997 %define output_col(b)	(b)+20		; JDIMENSION output_col
998
999 	align	16
1000-	global	EXTN(jsimd_idct_2x2_sse2)
1001+	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
1002
1003 EXTN(jsimd_idct_2x2_sse2):
1004 	push	ebp
1005Index: simd/jdmerss2.asm
1006===================================================================
1007--- simd/jdmerss2.asm	(revision 829)
1008+++ simd/jdmerss2.asm	(working copy)
1009@@ -35,7 +35,7 @@
1010 	SECTION	SEG_CONST
1011
1012 	alignz	16
1013-	global	EXTN(jconst_merged_upsample_sse2)
1014+	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
1015
1016 EXTN(jconst_merged_upsample_sse2):
1017
1018Index: simd/jfss2fst-64.asm
1019===================================================================
1020--- simd/jfss2fst-64.asm	(revision 829)
1021+++ simd/jfss2fst-64.asm	(working copy)
1022@@ -53,7 +53,7 @@
1023 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1024
1025 	alignz	16
1026-	global	EXTN(jconst_fdct_ifast_sse2)
1027+	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
1028
1029 EXTN(jconst_fdct_ifast_sse2):
1030
1031@@ -80,7 +80,7 @@
1032 %define WK_NUM		2
1033
1034 	align	16
1035-	global	EXTN(jsimd_fdct_ifast_sse2)
1036+	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
1037
1038 EXTN(jsimd_fdct_ifast_sse2):
1039 	push	rbp
1040Index: simd/jcqntmmx.asm
1041===================================================================
1042--- simd/jcqntmmx.asm	(revision 829)
1043+++ simd/jcqntmmx.asm	(working copy)
1044@@ -35,7 +35,7 @@
1045 %define workspace	ebp+16		; DCTELEM * workspace
1046
1047 	align	16
1048-	global	EXTN(jsimd_convsamp_mmx)
1049+	global	EXTN(jsimd_convsamp_mmx) PRIVATE
1050
1051 EXTN(jsimd_convsamp_mmx):
1052 	push	ebp
1053@@ -140,7 +140,7 @@
1054 %define workspace	ebp+16		; DCTELEM * workspace
1055
1056 	align	16
1057-	global	EXTN(jsimd_quantize_mmx)
1058+	global	EXTN(jsimd_quantize_mmx) PRIVATE
1059
1060 EXTN(jsimd_quantize_mmx):
1061 	push	ebp
1062Index: simd/jimmxfst.asm
1063===================================================================
1064--- simd/jimmxfst.asm	(revision 829)
1065+++ simd/jimmxfst.asm	(working copy)
1066@@ -59,7 +59,7 @@
1067 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1068
1069 	alignz	16
1070-	global	EXTN(jconst_idct_ifast_mmx)
1071+	global	EXTN(jconst_idct_ifast_mmx) PRIVATE
1072
1073 EXTN(jconst_idct_ifast_mmx):
1074
1075@@ -94,7 +94,7 @@
1076 					; JCOEF workspace[DCTSIZE2]
1077
1078 	align	16
1079-	global	EXTN(jsimd_idct_ifast_mmx)
1080+	global	EXTN(jsimd_idct_ifast_mmx) PRIVATE
1081
1082 EXTN(jsimd_idct_ifast_mmx):
1083 	push	ebp
1084Index: simd/jfss2fst.asm
1085===================================================================
1086--- simd/jfss2fst.asm	(revision 829)
1087+++ simd/jfss2fst.asm	(working copy)
1088@@ -52,7 +52,7 @@
1089 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1090
1091 	alignz	16
1092-	global	EXTN(jconst_fdct_ifast_sse2)
1093+	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
1094
1095 EXTN(jconst_fdct_ifast_sse2):
1096
1097@@ -80,7 +80,7 @@
1098 %define WK_NUM		2
1099
1100 	align	16
1101-	global	EXTN(jsimd_fdct_ifast_sse2)
1102+	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
1103
1104 EXTN(jsimd_fdct_ifast_sse2):
1105 	push	ebp
1106Index: simd/jcgrammx.asm
1107===================================================================
1108--- simd/jcgrammx.asm	(revision 829)
1109+++ simd/jcgrammx.asm	(working copy)
1110@@ -33,7 +33,7 @@
1111 	SECTION	SEG_CONST
1112
1113 	alignz	16
1114-	global	EXTN(jconst_rgb_gray_convert_mmx)
1115+	global	EXTN(jconst_rgb_gray_convert_mmx) PRIVATE
1116
1117 EXTN(jconst_rgb_gray_convert_mmx):
1118
1119Index: simd/jdcolss2-64.asm
1120===================================================================
1121--- simd/jdcolss2-64.asm	(revision 829)
1122+++ simd/jdcolss2-64.asm	(working copy)
1123@@ -35,7 +35,7 @@
1124 	SECTION	SEG_CONST
1125
1126 	alignz	16
1127-	global	EXTN(jconst_ycc_rgb_convert_sse2)
1128+	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
1129
1130 EXTN(jconst_ycc_rgb_convert_sse2):
1131
1132Index: simd/jf3dnflt.asm
1133===================================================================
1134--- simd/jf3dnflt.asm	(revision 829)
1135+++ simd/jf3dnflt.asm	(working copy)
1136@@ -27,7 +27,7 @@
1137 	SECTION	SEG_CONST
1138
1139 	alignz	16
1140-	global	EXTN(jconst_fdct_float_3dnow)
1141+	global	EXTN(jconst_fdct_float_3dnow) PRIVATE
1142
1143 EXTN(jconst_fdct_float_3dnow):
1144
1145@@ -55,7 +55,7 @@
1146 %define WK_NUM		2
1147
1148 	align	16
1149-	global	EXTN(jsimd_fdct_float_3dnow)
1150+	global	EXTN(jsimd_fdct_float_3dnow) PRIVATE
1151
1152 EXTN(jsimd_fdct_float_3dnow):
1153 	push	ebp
1154Index: simd/jdsamss2-64.asm
1155===================================================================
1156--- simd/jdsamss2-64.asm	(revision 829)
1157+++ simd/jdsamss2-64.asm	(working copy)
1158@@ -23,7 +23,7 @@
1159 	SECTION	SEG_CONST
1160
1161 	alignz	16
1162-	global	EXTN(jconst_fancy_upsample_sse2)
1163+	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
1164
1165 EXTN(jconst_fancy_upsample_sse2):
1166
1167@@ -59,7 +59,7 @@
1168 ; r13 = JSAMPARRAY * output_data_ptr
1169
1170 	align	16
1171-	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
1172+	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
1173
1174 EXTN(jsimd_h2v1_fancy_upsample_sse2):
1175 	push	rbp
1176@@ -201,7 +201,7 @@
1177 %define WK_NUM		4
1178
1179 	align	16
1180-	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
1181+	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
1182
1183 EXTN(jsimd_h2v2_fancy_upsample_sse2):
1184 	push	rbp
1185@@ -498,7 +498,7 @@
1186 ; r13 = JSAMPARRAY * output_data_ptr
1187
1188 	align	16
1189-	global	EXTN(jsimd_h2v1_upsample_sse2)
1190+	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
1191
1192 EXTN(jsimd_h2v1_upsample_sse2):
1193 	push	rbp
1194@@ -587,7 +587,7 @@
1195 ; r13 = JSAMPARRAY * output_data_ptr
1196
1197 	align	16
1198-	global	EXTN(jsimd_h2v2_upsample_sse2)
1199+	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
1200
1201 EXTN(jsimd_h2v2_upsample_sse2):
1202 	push	rbp
1203Index: simd/jcgrass2.asm
1204===================================================================
1205--- simd/jcgrass2.asm	(revision 829)
1206+++ simd/jcgrass2.asm	(working copy)
1207@@ -30,7 +30,7 @@
1208 	SECTION	SEG_CONST
1209
1210 	alignz	16
1211-	global	EXTN(jconst_rgb_gray_convert_sse2)
1212+	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
1213
1214 EXTN(jconst_rgb_gray_convert_sse2):
1215
1216Index: simd/jcsammmx.asm
1217===================================================================
1218--- simd/jcsammmx.asm	(revision 829)
1219+++ simd/jcsammmx.asm	(working copy)
1220@@ -40,7 +40,7 @@
1221 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
1222
1223 	align	16
1224-	global	EXTN(jsimd_h2v1_downsample_mmx)
1225+	global	EXTN(jsimd_h2v1_downsample_mmx) PRIVATE
1226
1227 EXTN(jsimd_h2v1_downsample_mmx):
1228 	push	ebp
1229@@ -182,7 +182,7 @@
1230 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
1231
1232 	align	16
1233-	global	EXTN(jsimd_h2v2_downsample_mmx)
1234+	global	EXTN(jsimd_h2v2_downsample_mmx) PRIVATE
1235
1236 EXTN(jsimd_h2v2_downsample_mmx):
1237 	push	ebp
1238Index: simd/jsimd_arm_neon.S
1239===================================================================
1240--- simd/jsimd_arm_neon.S	(revision 272637)
1241+++ simd/jsimd_arm_neon.S	(working copy)
1242@@ -41,11 +41,9 @@
1243 /* Supplementary macro for setting function attributes */
1244 .macro asm_function fname
1245 #ifdef __APPLE__
1246-    .func _\fname
1247     .globl _\fname
1248 _\fname:
1249 #else
1250-    .func \fname
1251     .global \fname
1252 #ifdef __ELF__
1253     .hidden \fname
1254@@ -670,7 +668,6 @@
1255     .unreq          ROW6R
1256     .unreq          ROW7L
1257     .unreq          ROW7R
1258-.endfunc
1259
1260
1261 /*****************************************************************************/
1262@@ -895,7 +892,6 @@
1263     .unreq          TMP2
1264     .unreq          TMP3
1265     .unreq          TMP4
1266-.endfunc
1267
1268
1269 /*****************************************************************************/
1270@@ -1108,7 +1104,6 @@
1271     .unreq          TMP2
1272     .unreq          TMP3
1273     .unreq          TMP4
1274-.endfunc
1275
1276 .purgem idct_helper
1277
1278@@ -1263,7 +1258,6 @@
1279     .unreq          OUTPUT_COL
1280     .unreq          TMP1
1281     .unreq          TMP2
1282-.endfunc
1283
1284 .purgem idct_helper
1285
1286@@ -1547,7 +1541,6 @@
1287     .unreq          U
1288     .unreq          V
1289     .unreq          N
1290-.endfunc
1291
1292 .purgem do_yuv_to_rgb
1293 .purgem do_yuv_to_rgb_stage1
1294@@ -1858,7 +1851,6 @@
1295     .unreq          U
1296     .unreq          V
1297     .unreq          N
1298-.endfunc
1299
1300 .purgem do_rgb_to_yuv
1301 .purgem do_rgb_to_yuv_stage1
1302@@ -1940,7 +1932,6 @@
1303     .unreq          TMP2
1304     .unreq          TMP3
1305     .unreq          TMP4
1306-.endfunc
1307
1308
1309 /*****************************************************************************/
1310@@ -2064,7 +2055,6 @@
1311
1312     .unreq          DATA
1313     .unreq          TMP
1314-.endfunc
1315
1316
1317 /*****************************************************************************/
1318@@ -2166,7 +2156,6 @@
1319     .unreq          CORRECTION
1320     .unreq          SHIFT
1321     .unreq          LOOP_COUNT
1322-.endfunc
1323
1324
1325 /*****************************************************************************/
1326@@ -2401,7 +2390,6 @@
1327     .unreq          WIDTH
1328     .unreq          TMP
1329
1330-.endfunc
1331
1332 .purgem upsample16
1333 .purgem upsample32
1334Index: simd/jsimd_i386.c
1335===================================================================
1336--- simd/jsimd_i386.c	(revision 829)
1337+++ simd/jsimd_i386.c	(working copy)
1338@@ -61,6 +61,7 @@
1339     simd_support &= JSIMD_SSE2;
1340 }
1341
1342+#ifndef JPEG_DECODE_ONLY
1343 GLOBAL(int)
1344 jsimd_can_rgb_ycc (void)
1345 {
1346@@ -82,6 +83,7 @@
1347
1348   return 0;
1349 }
1350+#endif
1351
1352 GLOBAL(int)
1353 jsimd_can_rgb_gray (void)
1354@@ -127,6 +129,7 @@
1355   return 0;
1356 }
1357
1358+#ifndef JPEG_DECODE_ONLY
1359 GLOBAL(void)
1360 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
1361                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
1362@@ -179,6 +182,7 @@
1363     mmxfct(cinfo->image_width, input_buf,
1364         output_buf, output_row, num_rows);
1365 }
1366+#endif
1367
1368 GLOBAL(void)
1369 jsimd_rgb_gray_convert (j_compress_ptr cinfo,
1370@@ -286,6 +290,7 @@
1371         input_row, output_buf, num_rows);
1372 }
1373
1374+#ifndef JPEG_DECODE_ONLY
1375 GLOBAL(int)
1376 jsimd_can_h2v2_downsample (void)
1377 {
1378@@ -351,6 +356,7 @@
1379         compptr->v_samp_factor, compptr->width_in_blocks,
1380         input_data, output_data);
1381 }
1382+#endif
1383
1384 GLOBAL(int)
1385 jsimd_can_h2v2_upsample (void)
1386@@ -636,6 +642,7 @@
1387         in_row_group_ctr, output_buf);
1388 }
1389
1390+#ifndef JPEG_DECODE_ONLY
1391 GLOBAL(int)
1392 jsimd_can_convsamp (void)
1393 {
1394@@ -855,6 +862,7 @@
1395   else if (simd_support & JSIMD_3DNOW)
1396     jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
1397 }
1398+#endif
1399
1400 GLOBAL(int)
1401 jsimd_can_idct_2x2 (void)
1402@@ -1045,4 +1053,3 @@
1403     jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
1404         output_buf, output_col);
1405 }
1406-
1407Index: simd/jcqnts2f-64.asm
1408===================================================================
1409--- simd/jcqnts2f-64.asm	(revision 829)
1410+++ simd/jcqnts2f-64.asm	(working copy)
1411@@ -36,7 +36,7 @@
1412 ; r12 = FAST_FLOAT * workspace
1413
1414 	align	16
1415-	global	EXTN(jsimd_convsamp_float_sse2)
1416+	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
1417
1418 EXTN(jsimd_convsamp_float_sse2):
1419 	push	rbp
1420@@ -110,7 +110,7 @@
1421 ; r12 = FAST_FLOAT * workspace
1422
1423 	align	16
1424-	global	EXTN(jsimd_quantize_float_sse2)
1425+	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
1426
1427 EXTN(jsimd_quantize_float_sse2):
1428 	push	rbp
1429Index: simd/jcqnt3dn.asm
1430===================================================================
1431--- simd/jcqnt3dn.asm	(revision 829)
1432+++ simd/jcqnt3dn.asm	(working copy)
1433@@ -35,7 +35,7 @@
1434 %define workspace	ebp+16		; FAST_FLOAT * workspace
1435
1436 	align	16
1437-	global	EXTN(jsimd_convsamp_float_3dnow)
1438+	global	EXTN(jsimd_convsamp_float_3dnow) PRIVATE
1439
1440 EXTN(jsimd_convsamp_float_3dnow):
1441 	push	ebp
1442@@ -138,7 +138,7 @@
1443 %define workspace	ebp+16		; FAST_FLOAT * workspace
1444
1445 	align	16
1446-	global	EXTN(jsimd_quantize_float_3dnow)
1447+	global	EXTN(jsimd_quantize_float_3dnow) PRIVATE
1448
1449 EXTN(jsimd_quantize_float_3dnow):
1450 	push	ebp
1451Index: simd/jcsamss2.asm
1452===================================================================
1453--- simd/jcsamss2.asm	(revision 829)
1454+++ simd/jcsamss2.asm	(working copy)
1455@@ -40,7 +40,7 @@
1456 %define output_data(b)	(b)+28		; JSAMPARRAY output_data
1457
1458 	align	16
1459-	global	EXTN(jsimd_h2v1_downsample_sse2)
1460+	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
1461
1462 EXTN(jsimd_h2v1_downsample_sse2):
1463 	push	ebp
1464@@ -195,7 +195,7 @@
1465 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
1466
1467 	align	16
1468-	global	EXTN(jsimd_h2v2_downsample_sse2)
1469+	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
1470
1471 EXTN(jsimd_h2v2_downsample_sse2):
1472 	push	ebp
1473Index: simd/jsimd_x86_64.c
1474===================================================================
1475--- simd/jsimd_x86_64.c	(revision 829)
1476+++ simd/jsimd_x86_64.c	(working copy)
1477@@ -29,6 +29,7 @@
1478
1479 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
1480
1481+#ifndef JPEG_DECODE_ONLY
1482 GLOBAL(int)
1483 jsimd_can_rgb_ycc (void)
1484 {
1485@@ -45,6 +46,7 @@
1486
1487   return 1;
1488 }
1489+#endif
1490
1491 GLOBAL(int)
1492 jsimd_can_rgb_gray (void)
1493@@ -80,6 +82,7 @@
1494   return 1;
1495 }
1496
1497+#ifndef JPEG_DECODE_ONLY
1498 GLOBAL(void)
1499 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
1500                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
1501@@ -118,6 +121,7 @@
1502
1503   sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
1504 }
1505+#endif
1506
1507 GLOBAL(void)
1508 jsimd_rgb_gray_convert (j_compress_ptr cinfo,
1509@@ -197,6 +201,7 @@
1510   sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
1511 }
1512
1513+#ifndef JPEG_DECODE_ONLY
1514 GLOBAL(int)
1515 jsimd_can_h2v2_downsample (void)
1516 {
1517@@ -242,6 +247,7 @@
1518                              compptr->width_in_blocks,
1519                              input_data, output_data);
1520 }
1521+#endif
1522
1523 GLOBAL(int)
1524 jsimd_can_h2v2_upsample (void)
1525@@ -451,6 +457,7 @@
1526   sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
1527 }
1528
1529+#ifndef JPEG_DECODE_ONLY
1530 GLOBAL(int)
1531 jsimd_can_convsamp (void)
1532 {
1533@@ -601,6 +608,7 @@
1534 {
1535   jsimd_quantize_float_sse2(coef_block, divisors, workspace);
1536 }
1537+#endif
1538
1539 GLOBAL(int)
1540 jsimd_can_idct_2x2 (void)
1541@@ -750,4 +758,3 @@
1542   jsimd_idct_float_sse2(compptr->dct_table, coef_block,
1543                         output_buf, output_col);
1544 }
1545-
1546Index: simd/jimmxint.asm
1547===================================================================
1548--- simd/jimmxint.asm	(revision 829)
1549+++ simd/jimmxint.asm	(working copy)
1550@@ -66,7 +66,7 @@
1551 	SECTION	SEG_CONST
1552
1553 	alignz	16
1554-	global	EXTN(jconst_idct_islow_mmx)
1555+	global	EXTN(jconst_idct_islow_mmx) PRIVATE
1556
1557 EXTN(jconst_idct_islow_mmx):
1558
1559@@ -107,7 +107,7 @@
1560 					; JCOEF workspace[DCTSIZE2]
1561
1562 	align	16
1563-	global	EXTN(jsimd_idct_islow_mmx)
1564+	global	EXTN(jsimd_idct_islow_mmx) PRIVATE
1565
1566 EXTN(jsimd_idct_islow_mmx):
1567 	push	ebp
1568Index: simd/jcgrymmx.asm
1569===================================================================
1570--- simd/jcgrymmx.asm	(revision 829)
1571+++ simd/jcgrymmx.asm	(working copy)
1572@@ -41,7 +41,7 @@
1573 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
1574
1575 	align	16
1576-	global	EXTN(jsimd_rgb_gray_convert_mmx)
1577+	global	EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE
1578
1579 EXTN(jsimd_rgb_gray_convert_mmx):
1580 	push	ebp
1581Index: simd/jfss2int.asm
1582===================================================================
1583--- simd/jfss2int.asm	(revision 829)
1584+++ simd/jfss2int.asm	(working copy)
1585@@ -66,7 +66,7 @@
1586 	SECTION	SEG_CONST
1587
1588 	alignz	16
1589-	global	EXTN(jconst_fdct_islow_sse2)
1590+	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
1591
1592 EXTN(jconst_fdct_islow_sse2):
1593
1594@@ -101,7 +101,7 @@
1595 %define WK_NUM		6
1596
1597 	align	16
1598-	global	EXTN(jsimd_fdct_islow_sse2)
1599+	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
1600
1601 EXTN(jsimd_fdct_islow_sse2):
1602 	push	ebp
1603Index: simd/jcgryss2.asm
1604===================================================================
1605--- simd/jcgryss2.asm	(revision 829)
1606+++ simd/jcgryss2.asm	(working copy)
1607@@ -39,7 +39,7 @@
1608
1609 	align	16
1610
1611-	global	EXTN(jsimd_rgb_gray_convert_sse2)
1612+	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
1613
1614 EXTN(jsimd_rgb_gray_convert_sse2):
1615 	push	ebp
1616Index: simd/jccolmmx.asm
1617===================================================================
1618--- simd/jccolmmx.asm	(revision 829)
1619+++ simd/jccolmmx.asm	(working copy)
1620@@ -37,7 +37,7 @@
1621 	SECTION	SEG_CONST
1622
1623 	alignz	16
1624-	global	EXTN(jconst_rgb_ycc_convert_mmx)
1625+	global	EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE
1626
1627 EXTN(jconst_rgb_ycc_convert_mmx):
1628
1629Index: simd/jimmxred.asm
1630===================================================================
1631--- simd/jimmxred.asm	(revision 829)
1632+++ simd/jimmxred.asm	(working copy)
1633@@ -72,7 +72,7 @@
1634 	SECTION	SEG_CONST
1635
1636 	alignz	16
1637-	global	EXTN(jconst_idct_red_mmx)
1638+	global	EXTN(jconst_idct_red_mmx) PRIVATE
1639
1640 EXTN(jconst_idct_red_mmx):
1641
1642@@ -115,7 +115,7 @@
1643 					; JCOEF workspace[DCTSIZE2]
1644
1645 	align	16
1646-	global	EXTN(jsimd_idct_4x4_mmx)
1647+	global	EXTN(jsimd_idct_4x4_mmx) PRIVATE
1648
1649 EXTN(jsimd_idct_4x4_mmx):
1650 	push	ebp
1651@@ -503,7 +503,7 @@
1652 %define output_col(b)	(b)+20		; JDIMENSION output_col
1653
1654 	align	16
1655-	global	EXTN(jsimd_idct_2x2_mmx)
1656+	global	EXTN(jsimd_idct_2x2_mmx) PRIVATE
1657
1658 EXTN(jsimd_idct_2x2_mmx):
1659 	push	ebp
1660Index: simd/jsimdext.inc
1661===================================================================
1662--- simd/jsimdext.inc	(revision 829)
1663+++ simd/jsimdext.inc	(working copy)
1664@@ -73,6 +73,9 @@
1665 ; * *BSD family Unix using elf format
1666 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
1667
1668+; PIC is the default on Linux
1669+%define PIC
1670+
1671 ; mark stack as non-executable
1672 section .note.GNU-stack noalloc noexec nowrite progbits
1673
1674@@ -375,4 +378,14 @@
1675 ;
1676 %include "jsimdcfg.inc"
1677
1678+; Begin chromium edits
1679+%ifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
1680+%define PRIVATE :private_extern
1681+%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
1682+%define PRIVATE :hidden
1683+%else
1684+%define PRIVATE
1685+%endif
1686+; End chromium edits
1687+
1688 ; --------------------------------------------------------------------------
1689Index: simd/jdclrmmx.asm
1690===================================================================
1691--- simd/jdclrmmx.asm	(revision 829)
1692+++ simd/jdclrmmx.asm	(working copy)
1693@@ -40,7 +40,7 @@
1694 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
1695
1696 	align	16
1697-	global	EXTN(jsimd_ycc_rgb_convert_mmx)
1698+	global	EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE
1699
1700 EXTN(jsimd_ycc_rgb_convert_mmx):
1701 	push	ebp
1702Index: simd/jccolss2.asm
1703===================================================================
1704--- simd/jccolss2.asm	(revision 829)
1705+++ simd/jccolss2.asm	(working copy)
1706@@ -34,7 +34,7 @@
1707 	SECTION	SEG_CONST
1708
1709 	alignz	16
1710-	global	EXTN(jconst_rgb_ycc_convert_sse2)
1711+	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
1712
1713 EXTN(jconst_rgb_ycc_convert_sse2):
1714
1715Index: simd/jisseflt.asm
1716===================================================================
1717--- simd/jisseflt.asm	(revision 829)
1718+++ simd/jisseflt.asm	(working copy)
1719@@ -37,7 +37,7 @@
1720 	SECTION	SEG_CONST
1721
1722 	alignz	16
1723-	global	EXTN(jconst_idct_float_sse)
1724+	global	EXTN(jconst_idct_float_sse) PRIVATE
1725
1726 EXTN(jconst_idct_float_sse):
1727
1728@@ -73,7 +73,7 @@
1729 					; FAST_FLOAT workspace[DCTSIZE2]
1730
1731 	align	16
1732-	global	EXTN(jsimd_idct_float_sse)
1733+	global	EXTN(jsimd_idct_float_sse) PRIVATE
1734
1735 EXTN(jsimd_idct_float_sse):
1736 	push	ebp
1737Index: simd/jcqnts2i-64.asm
1738===================================================================
1739--- simd/jcqnts2i-64.asm	(revision 829)
1740+++ simd/jcqnts2i-64.asm	(working copy)
1741@@ -36,7 +36,7 @@
1742 ; r12 = DCTELEM * workspace
1743
1744 	align	16
1745-	global	EXTN(jsimd_convsamp_sse2)
1746+	global	EXTN(jsimd_convsamp_sse2) PRIVATE
1747
1748 EXTN(jsimd_convsamp_sse2):
1749 	push	rbp
1750@@ -112,7 +112,7 @@
1751 ; r12 = DCTELEM * workspace
1752
1753 	align	16
1754-	global	EXTN(jsimd_quantize_sse2)
1755+	global	EXTN(jsimd_quantize_sse2) PRIVATE
1756
1757 EXTN(jsimd_quantize_sse2):
1758 	push	rbp
1759Index: simd/jdclrss2.asm
1760===================================================================
1761--- simd/jdclrss2.asm	(revision 829)
1762+++ simd/jdclrss2.asm	(working copy)
1763@@ -40,7 +40,7 @@
1764 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
1765
1766 	align	16
1767-	global	EXTN(jsimd_ycc_rgb_convert_sse2)
1768+	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
1769
1770 EXTN(jsimd_ycc_rgb_convert_sse2):
1771 	push	ebp
1772Index: simd/jcqntsse.asm
1773===================================================================
1774--- simd/jcqntsse.asm	(revision 829)
1775+++ simd/jcqntsse.asm	(working copy)
1776@@ -35,7 +35,7 @@
1777 %define workspace	ebp+16		; FAST_FLOAT * workspace
1778
1779 	align	16
1780-	global	EXTN(jsimd_convsamp_float_sse)
1781+	global	EXTN(jsimd_convsamp_float_sse) PRIVATE
1782
1783 EXTN(jsimd_convsamp_float_sse):
1784 	push	ebp
1785@@ -138,7 +138,7 @@
1786 %define workspace	ebp+16		; FAST_FLOAT * workspace
1787
1788 	align	16
1789-	global	EXTN(jsimd_quantize_float_sse)
1790+	global	EXTN(jsimd_quantize_float_sse) PRIVATE
1791
1792 EXTN(jsimd_quantize_float_sse):
1793 	push	ebp
1794Index: simd/jiss2int-64.asm
1795===================================================================
1796--- simd/jiss2int-64.asm	(revision 829)
1797+++ simd/jiss2int-64.asm	(working copy)
1798@@ -67,7 +67,7 @@
1799 	SECTION	SEG_CONST
1800
1801 	alignz	16
1802-	global	EXTN(jconst_idct_islow_sse2)
1803+	global	EXTN(jconst_idct_islow_sse2) PRIVATE
1804
1805 EXTN(jconst_idct_islow_sse2):
1806
1807@@ -106,7 +106,7 @@
1808 %define WK_NUM		12
1809
1810 	align	16
1811-	global	EXTN(jsimd_idct_islow_sse2)
1812+	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
1813
1814 EXTN(jsimd_idct_islow_sse2):
1815 	push	rbp
1816Index: simd/jfmmxfst.asm
1817===================================================================
1818--- simd/jfmmxfst.asm	(revision 829)
1819+++ simd/jfmmxfst.asm	(working copy)
1820@@ -52,7 +52,7 @@
1821 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1822
1823 	alignz	16
1824-	global	EXTN(jconst_fdct_ifast_mmx)
1825+	global	EXTN(jconst_fdct_ifast_mmx) PRIVATE
1826
1827 EXTN(jconst_fdct_ifast_mmx):
1828
1829@@ -80,7 +80,7 @@
1830 %define WK_NUM		2
1831
1832 	align	16
1833-	global	EXTN(jsimd_fdct_ifast_mmx)
1834+	global	EXTN(jsimd_fdct_ifast_mmx) PRIVATE
1835
1836 EXTN(jsimd_fdct_ifast_mmx):
1837 	push	ebp
1838Index: jdarith.c
1839===================================================================
1840--- jdarith.c	(revision 829)
1841+++ jdarith.c	(working copy)
1842@@ -150,8 +150,8 @@
1843    */
1844   sv = *st;
1845   qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
1846-  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
1847-  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
1848+  nl = (unsigned char) qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
1849+  nm = (unsigned char) qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
1850
1851   /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
1852   temp = e->a - qe;
1853Index: jdhuff.c
1854===================================================================
1855--- jdhuff.c	(revision 829)
1856+++ jdhuff.c	(working copy)
1857@@ -742,7 +742,7 @@
1858  * this module, since we'll just re-assign them on the next call.)
1859  */
1860
1861-#define BUFSIZE (DCTSIZE2 * 2)
1862+#define BUFSIZE (DCTSIZE2 * 2u)
1863
1864 METHODDEF(boolean)
1865 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
1866Index: jchuff.c
1867===================================================================
1868--- jchuff.c	(revision 1219)
1869+++ jchuff.c	(revision 1220)
1870@@ -22,8 +22,36 @@
1871 #include "jchuff.h"		/* Declarations shared with jcphuff.c */
1872 #include <limits.h>
1873
1874+/*
1875+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
1876+ * used for bit counting rather than the lookup table.  This will reduce the
1877+ * memory footprint by 64k, which is important for some mobile applications
1878+ * that create many isolated instances of libjpeg-turbo (web browsers, for
1879+ * instance.)  This may improve performance on some mobile platforms as well.
1880+ * This feature is enabled by default only on ARM processors, because some x86
1881+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
1882+ * shown to have a significant performance impact even on the x86 chips that
1883+ * have a fast implementation of it.  When building for ARMv6, you can
1884+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
1885+ * flags (this defines __thumb__).
1886+ */
1887+
1888+/* NOTE: Both GCC and Clang define __GNUC__ */
1889+#if defined __GNUC__ && defined __arm__
1890+#if !defined __thumb__ || defined __thumb2__
1891+#define USE_CLZ_INTRINSIC
1892+#endif
1893+#endif
1894+
1895+#ifdef USE_CLZ_INTRINSIC
1896+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
1897+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
1898+#else
1899 static unsigned char jpeg_nbits_table[65536];
1900 static int jpeg_nbits_table_init = 0;
1901+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
1902+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
1903+#endif
1904
1905 #ifndef min
1906  #define min(a,b) ((a)<(b)?(a):(b))
1907@@ -272,6 +300,7 @@
1908     dtbl->ehufsi[i] = huffsize[p];
1909   }
1910
1911+#ifndef USE_CLZ_INTRINSIC
1912   if(!jpeg_nbits_table_init) {
1913     for(i = 0; i < 65536; i++) {
1914       int nbits = 0, temp = i;
1915@@ -280,6 +309,7 @@
1916     }
1917     jpeg_nbits_table_init = 1;
1918   }
1919+#endif
1920 }
1921
1922
1923@@ -482,7 +512,7 @@
1924   temp2 += temp3;
1925
1926   /* Find the number of bits needed for the magnitude of the coefficient */
1927-  nbits = jpeg_nbits_table[temp];
1928+  nbits = JPEG_NBITS(temp);
1929
1930   /* Emit the Huffman-coded symbol for the number of bits */
1931   code = dctbl->ehufco[nbits];
1932@@ -516,7 +546,7 @@
1933     temp ^= temp3; \
1934     temp -= temp3; \
1935     temp2 += temp3; \
1936-    nbits = jpeg_nbits_table[temp]; \
1937+    nbits = JPEG_NBITS_NONZERO(temp); \
1938     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
1939     while (r > 15) { \
1940       EMIT_BITS(code_0xf0, size_0xf0) \
1941Index: simd/jsimd_arm64.c
1942===================================================================
1943--- /dev/null
1944+++ simd/jsimd_arm64.c
1945@@ -0,0 +1,544 @@
1946+/*
1947+ * jsimd_arm64.c
1948+ *
1949+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
1950+ * Copyright 2009-2011, 2013-2014 D. R. Commander
1951+ *
1952+ * Based on the x86 SIMD extension for IJG JPEG library,
1953+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
1954+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
1955+ *
1956+ * This file contains the interface between the "normal" portions
1957+ * of the library and the SIMD implementations when running on a
1958+ * 64-bit ARM architecture.
1959+ */
1960+
1961+#define JPEG_INTERNALS
1962+#include "../jinclude.h"
1963+#include "../jpeglib.h"
1964+#include "../jsimd.h"
1965+#include "../jdct.h"
1966+#include "../jsimddct.h"
1967+#include "jsimd.h"
1968+
1969+#include <stdio.h>
1970+#include <string.h>
1971+#include <ctype.h>
1972+
1973+static unsigned int simd_support = ~0;
1974+
1975+/*
1976+ * Check what SIMD accelerations are supported.
1977+ *
1978+ * FIXME: This code is racy under a multi-threaded environment.
1979+ */
1980+
1981+/*
1982+ * ARMv8 architectures support NEON extensions by default.
1983+ * It is no longer optional as it was with ARMv7.
1984+ */
1985+
1986+
1987+LOCAL(void)
1988+init_simd (void)
1989+{
1990+  char *env = NULL;
1991+
1992+  if (simd_support != ~0U)
1993+    return;
1994+
1995+  simd_support = 0;
1996+
1997+  simd_support |= JSIMD_ARM_NEON;
1998+
1999+  /* Force different settings through environment variables */
2000+  env = getenv("JSIMD_FORCENEON");
2001+  if ((env != NULL) && (strcmp(env, "1") == 0))
2002+    simd_support &= JSIMD_ARM_NEON;
2003+  env = getenv("JSIMD_FORCENONE");
2004+  if ((env != NULL) && (strcmp(env, "1") == 0))
2005+    simd_support = 0;
2006+}
2007+
2008+GLOBAL(int)
2009+jsimd_can_rgb_ycc (void)
2010+{
2011+  init_simd();
2012+
2013+  return 0;
2014+}
2015+
2016+GLOBAL(int)
2017+jsimd_can_rgb_gray (void)
2018+{
2019+  init_simd();
2020+
2021+  return 0;
2022+}
2023+
2024+GLOBAL(int)
2025+jsimd_can_ycc_rgb (void)
2026+{
2027+  init_simd();
2028+
2029+  /* The code is optimised for these values only */
2030+  if (BITS_IN_JSAMPLE != 8)
2031+    return 0;
2032+  if (sizeof(JDIMENSION) != 4)
2033+    return 0;
2034+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
2035+    return 0;
2036+
2037+  if (simd_support & JSIMD_ARM_NEON)
2038+    return 1;
2039+
2040+  return 0;
2041+}
2042+
2043+GLOBAL(int)  /* Returns 1 when the NEON YCC->RGB565 path may be used, otherwise 0. */
2044+jsimd_can_ycc_rgb565 (void)
2045+{
2046+  init_simd();
2047+
2048+  /* The code is optimised for these values only */
2049+  if (BITS_IN_JSAMPLE != 8)
2050+    return 0;
2051+  if (sizeof(JDIMENSION) != 4)
2052+    return 0;
2053+
2054+  if (simd_support & JSIMD_ARM_NEON)  /* bit set by init_simd() unless JSIMD_FORCENONE=1 */
2055+    return 1;
2056+
2057+  return 0;
2058+}
2059+
2060+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_rgb_ycc() returns 0 -- confirm with callers. */
2061+jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
2062+                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2063+                       JDIMENSION output_row, int num_rows)
2064+{
2065+}
2066+
2067+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_rgb_gray() returns 0 -- confirm with callers. */
2068+jsimd_rgb_gray_convert (j_compress_ptr cinfo,
2069+                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2070+                        JDIMENSION output_row, int num_rows)
2071+{
2072+}
2073+
2074+GLOBAL(void)  /* Picks the NEON converter matching cinfo->out_color_space and runs it on num_rows rows. */
2075+jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
2076+                       JSAMPIMAGE input_buf, JDIMENSION input_row,
2077+                       JSAMPARRAY output_buf, int num_rows)
2078+{
2079+  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
2080+
2081+  switch(cinfo->out_color_space) {  /* the X and A variants of a layout share one routine */
2082+    case JCS_EXT_RGB:
2083+      neonfct=jsimd_ycc_extrgb_convert_neon;
2084+      break;
2085+    case JCS_EXT_RGBX:
2086+    case JCS_EXT_RGBA:
2087+      neonfct=jsimd_ycc_extrgbx_convert_neon;
2088+      break;
2089+    case JCS_EXT_BGR:
2090+      neonfct=jsimd_ycc_extbgr_convert_neon;
2091+      break;
2092+    case JCS_EXT_BGRX:
2093+    case JCS_EXT_BGRA:
2094+      neonfct=jsimd_ycc_extbgrx_convert_neon;
2095+      break;
2096+    case JCS_EXT_XBGR:
2097+    case JCS_EXT_ABGR:
2098+      neonfct=jsimd_ycc_extxbgr_convert_neon;
2099+      break;
2100+    case JCS_EXT_XRGB:
2101+    case JCS_EXT_ARGB:
2102+      neonfct=jsimd_ycc_extxrgb_convert_neon;
2103+      break;
2104+    default:  /* every colour space not listed above uses the extrgb routine */
2105+      neonfct=jsimd_ycc_extrgb_convert_neon;
2106+      break;
2107+  }
2108+
2109+  if (simd_support & JSIMD_ARM_NEON)  /* init_simd() is not called here; assumes jsimd_can_ycc_rgb() ran first -- TODO confirm */
2110+    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
2111+}
2112+
2113+GLOBAL(void)  /* Forwards to jsimd_ycc_rgb565_convert_neon when the NEON bit is set; otherwise does nothing. */
2114+jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
2115+                          JSAMPIMAGE input_buf, JDIMENSION input_row,
2116+                          JSAMPARRAY output_buf, int num_rows)
2117+{
2118+  if (simd_support & JSIMD_ARM_NEON)  /* init_simd() is not called here; assumes jsimd_can_ycc_rgb565() ran first -- TODO confirm */
2119+    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
2120+                                  output_buf, num_rows);
2121+}
2122+
2123+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2124+jsimd_can_h2v2_downsample (void)
2125+{
2126+  init_simd();
2127+
2128+  return 0;
2129+}
2130+
2131+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2132+jsimd_can_h2v1_downsample (void)
2133+{
2134+  init_simd();
2135+
2136+  return 0;
2137+}
2138+
2139+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v2_downsample() returns 0 -- confirm with callers. */
2140+jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2141+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
2142+{
2143+}
2144+
2145+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v1_downsample() returns 0 -- confirm with callers. */
2146+jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2147+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
2148+{
2149+}
2150+
2151+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2152+jsimd_can_h2v2_upsample (void)
2153+{
2154+  init_simd();
2155+
2156+  return 0;
2157+}
2158+
2159+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2160+jsimd_can_h2v1_upsample (void)
2161+{
2162+  init_simd();
2163+
2164+  return 0;
2165+}
2166+
2167+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v2_upsample() returns 0 -- confirm with callers. */
2168+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
2169+                     jpeg_component_info * compptr,
2170+                     JSAMPARRAY input_data,
2171+                     JSAMPARRAY * output_data_ptr)
2172+{
2173+}
2174+
2175+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v1_upsample() returns 0 -- confirm with callers. */
2176+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
2177+                     jpeg_component_info * compptr,
2178+                     JSAMPARRAY input_data,
2179+                     JSAMPARRAY * output_data_ptr)
2180+{
2181+}
2182+
2183+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2184+jsimd_can_h2v2_fancy_upsample (void)
2185+{
2186+  init_simd();
2187+
2188+  return 0;
2189+}
2190+
2191+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2192+jsimd_can_h2v1_fancy_upsample (void)
2193+{
2194+  init_simd();
2195+
2196+  return 0;
2197+}
2198+
2199+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v2_fancy_upsample() returns 0 -- confirm with callers. */
2200+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
2201+                           jpeg_component_info * compptr,
2202+                           JSAMPARRAY input_data,
2203+                           JSAMPARRAY * output_data_ptr)
2204+{
2205+}
2206+
2207+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v1_fancy_upsample() returns 0 -- confirm with callers. */
2208+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
2209+                           jpeg_component_info * compptr,
2210+                           JSAMPARRAY input_data,
2211+                           JSAMPARRAY * output_data_ptr)
2212+{
2213+}
2214+
2215+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2216+jsimd_can_h2v2_merged_upsample (void)
2217+{
2218+  init_simd();
2219+
2220+  return 0;
2221+}
2222+
2223+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2224+jsimd_can_h2v1_merged_upsample (void)
2225+{
2226+  init_simd();
2227+
2228+  return 0;
2229+}
2230+
2231+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v2_merged_upsample() returns 0 -- confirm with callers. */
2232+jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
2233+                            JSAMPIMAGE input_buf,
2234+                            JDIMENSION in_row_group_ctr,
2235+                            JSAMPARRAY output_buf)
2236+{
2237+}
2238+
2239+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_h2v1_merged_upsample() returns 0 -- confirm with callers. */
2240+jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
2241+                            JSAMPIMAGE input_buf,
2242+                            JDIMENSION in_row_group_ctr,
2243+                            JSAMPARRAY output_buf)
2244+{
2245+}
2246+
2247+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2248+jsimd_can_convsamp (void)
2249+{
2250+  init_simd();
2251+
2252+  return 0;
2253+}
2254+
2255+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2256+jsimd_can_convsamp_float (void)
2257+{
2258+  init_simd();
2259+
2260+  return 0;
2261+}
2262+
2263+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_convsamp() returns 0 -- confirm with callers. */
2264+jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
2265+                DCTELEM * workspace)
2266+{
2267+}
2268+
2269+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_convsamp_float() returns 0 -- confirm with callers. */
2270+jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
2271+                      FAST_FLOAT * workspace)
2272+{
2273+}
2274+
2275+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2276+jsimd_can_fdct_islow (void)
2277+{
2278+  init_simd();
2279+
2280+  return 0;
2281+}
2282+
2283+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2284+jsimd_can_fdct_ifast (void)
2285+{
2286+  init_simd();
2287+
2288+  return 0;
2289+}
2290+
2291+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2292+jsimd_can_fdct_float (void)
2293+{
2294+  init_simd();
2295+
2296+  return 0;
2297+}
2298+
2299+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_fdct_islow() returns 0 -- confirm with callers. */
2300+jsimd_fdct_islow (DCTELEM * data)
2301+{
2302+}
2303+
2304+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_fdct_ifast() returns 0 -- confirm with callers. */
2305+jsimd_fdct_ifast (DCTELEM * data)
2306+{
2307+}
2308+
2309+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_fdct_float() returns 0 -- confirm with callers. */
2310+jsimd_fdct_float (FAST_FLOAT * data)
2311+{
2312+}
2313+
2314+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2315+jsimd_can_quantize (void)
2316+{
2317+  init_simd();
2318+
2319+  return 0;
2320+}
2321+
2322+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2323+jsimd_can_quantize_float (void)
2324+{
2325+  init_simd();
2326+
2327+  return 0;
2328+}
2329+
2330+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_quantize() returns 0 -- confirm with callers. */
2331+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
2332+                DCTELEM * workspace)
2333+{
2334+}
2335+
2336+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_quantize_float() returns 0 -- confirm with callers. */
2337+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
2338+                      FAST_FLOAT * workspace)
2339+{
2340+}
2341+
2342+GLOBAL(int)  /* Returns 1 when the NEON 2x2 IDCT may be used, otherwise 0. */
2343+jsimd_can_idct_2x2 (void)
2344+{
2345+  init_simd();
2346+
2347+  /* The code is optimised for these values only */
2348+  if (DCTSIZE != 8)
2349+    return 0;
2350+  if (sizeof(JCOEF) != 2)
2351+    return 0;
2352+  if (BITS_IN_JSAMPLE != 8)
2353+    return 0;
2354+  if (sizeof(JDIMENSION) != 4)
2355+    return 0;
2356+  if (sizeof(ISLOW_MULT_TYPE) != 2)
2357+    return 0;
2358+
2359+  if (simd_support & JSIMD_ARM_NEON)  /* bit set by init_simd() unless JSIMD_FORCENONE=1 */
2360+    return 1;
2361+
2362+  return 0;
2363+}
2364+
2365+GLOBAL(int)  /* Returns 1 when the NEON 4x4 IDCT may be used, otherwise 0. */
2366+jsimd_can_idct_4x4 (void)
2367+{
2368+  init_simd();
2369+
2370+  /* The code is optimised for these values only */
2371+  if (DCTSIZE != 8)
2372+    return 0;
2373+  if (sizeof(JCOEF) != 2)
2374+    return 0;
2375+  if (BITS_IN_JSAMPLE != 8)
2376+    return 0;
2377+  if (sizeof(JDIMENSION) != 4)
2378+    return 0;
2379+  if (sizeof(ISLOW_MULT_TYPE) != 2)
2380+    return 0;
2381+
2382+  if (simd_support & JSIMD_ARM_NEON)  /* bit set by init_simd() unless JSIMD_FORCENONE=1 */
2383+    return 1;
2384+
2385+  return 0;
2386+}
2387+
2388+GLOBAL(void)  /* Forwards to jsimd_idct_2x2_neon when the NEON bit is set; otherwise does nothing. cinfo is unused. */
2389+jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2390+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
2391+                JDIMENSION output_col)
2392+{
2393+  if (simd_support & JSIMD_ARM_NEON)  /* init_simd() is not called here; assumes jsimd_can_idct_2x2() ran first -- TODO confirm */
2394+    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
2395+                        output_col);
2396+}
2397+
2398+GLOBAL(void)  /* Forwards to jsimd_idct_4x4_neon when the NEON bit is set; otherwise does nothing. cinfo is unused. */
2399+jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2400+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
2401+                JDIMENSION output_col)
2402+{
2403+  if (simd_support & JSIMD_ARM_NEON)  /* init_simd() is not called here; assumes jsimd_can_idct_4x4() ran first -- TODO confirm */
2404+    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
2405+                        output_col);
2406+}
2407+
2408+GLOBAL(int)  /* Returns 1 when the NEON ISLOW IDCT may be used, otherwise 0. */
2409+jsimd_can_idct_islow (void)
2410+{
2411+  init_simd();
2412+
2413+  /* The code is optimised for these values only */
2414+  if (DCTSIZE != 8)
2415+    return 0;
2416+  if (sizeof(JCOEF) != 2)
2417+    return 0;
2418+  if (BITS_IN_JSAMPLE != 8)
2419+    return 0;
2420+  if (sizeof(JDIMENSION) != 4)
2421+    return 0;
2422+  if (sizeof(ISLOW_MULT_TYPE) != 2)
2423+    return 0;
2424+
2425+  if (simd_support & JSIMD_ARM_NEON)  /* bit set by init_simd() unless JSIMD_FORCENONE=1 */
2426+    return 1;
2427+
2428+  return 0;
2429+}
2430+
2431+GLOBAL(int)  /* Returns 1 when the NEON IFAST IDCT may be used, otherwise 0. */
2432+jsimd_can_idct_ifast (void)
2433+{
2434+  init_simd();
2435+
2436+  /* The code is optimised for these values only */
2437+  if (DCTSIZE != 8)
2438+    return 0;
2439+  if (sizeof(JCOEF) != 2)
2440+    return 0;
2441+  if (BITS_IN_JSAMPLE != 8)
2442+    return 0;
2443+  if (sizeof(JDIMENSION) != 4)
2444+    return 0;
2445+  if (sizeof(IFAST_MULT_TYPE) != 2)
2446+    return 0;
2447+  if (IFAST_SCALE_BITS != 2)
2448+    return 0;
2449+
2450+  if (simd_support & JSIMD_ARM_NEON)  /* bit set by init_simd() unless JSIMD_FORCENONE=1 */
2451+    return 1;
2452+
2453+  return 0;
2454+}
2455+
2456+GLOBAL(int)  /* Capability query: calls init_simd(), then unconditionally returns 0. */
2457+jsimd_can_idct_float (void)
2458+{
2459+  init_simd();
2460+
2461+  return 0;
2462+}
2463+
2464+GLOBAL(void)  /* Forwards to jsimd_idct_islow_neon when the NEON bit is set; otherwise does nothing. cinfo is unused. */
2465+jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2466+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
2467+                  JDIMENSION output_col)
2468+{
2469+  if (simd_support & JSIMD_ARM_NEON)  /* init_simd() is not called here; assumes jsimd_can_idct_islow() ran first -- TODO confirm */
2470+    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
2471+                          output_col);
2472+}
2473+
2474+GLOBAL(void)  /* Forwards to jsimd_idct_ifast_neon when the NEON bit is set; otherwise does nothing. cinfo is unused. */
2475+jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2476+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
2477+                  JDIMENSION output_col)
2478+{
2479+  if (simd_support & JSIMD_ARM_NEON)  /* init_simd() is not called here; assumes jsimd_can_idct_ifast() ran first -- TODO confirm */
2480+    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
2481+                          output_col);
2482+}
2483+
2484+GLOBAL(void)  /* Empty stub: does nothing. Presumably never selected, since jsimd_can_idct_float() returns 0 -- confirm with callers. */
2485+jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2486+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
2487+                  JDIMENSION output_col)
2488+{
2489+}
2490Index: simd/jsimd_arm64_neon.S
2491new file mode 100644
2492===================================================================
2493--- /dev/null
2494+++ simd/jsimd_arm64_neon.S
2495@@ -0,0 +1,1861 @@
2496+/*
2497+ * ARMv8 NEON optimizations for libjpeg-turbo
2498+ *
2499+ * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
2500+ * All rights reserved.
2501+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
2502+ * Copyright (C) 2013-2014, Linaro Limited
2503+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
2504+ *
2505+ * This software is provided 'as-is', without any express or implied
2506+ * warranty.  In no event will the authors be held liable for any damages
2507+ * arising from the use of this software.
2508+ *
2509+ * Permission is granted to anyone to use this software for any purpose,
2510+ * including commercial applications, and to alter it and redistribute it
2511+ * freely, subject to the following restrictions:
2512+ *
2513+ * 1. The origin of this software must not be misrepresented; you must not
2514+ *    claim that you wrote the original software. If you use this software
2515+ *    in a product, an acknowledgment in the product documentation would be
2516+ *    appreciated but is not required.
2517+ * 2. Altered source versions must be plainly marked as such, and must not be
2518+ *    misrepresented as being the original software.
2519+ * 3. This notice may not be removed or altered from any source distribution.
2520+ */
2521+
2522+#if defined(__linux__) && defined(__ELF__)
2523+.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
2524+#endif
2525+
2526+.text
2527+.arch armv8-a+fp+simd
2528+
2529+
2530+#define RESPECT_STRICT_ALIGNMENT 1
2531+
2532+
2533+/*****************************************************************************/
2534+
2535+/* Supplementary macro for setting function attributes */
2536+.macro asm_function fname  /* emits the export directives and the entry label for \fname */
2537+#ifdef __APPLE__
2538+    .globl _\fname  /* Apple builds: symbol and label carry a leading underscore */
2539+_\fname:
2540+#else
2541+    .global \fname
2542+#ifdef __ELF__
2543+    .hidden \fname  /* ELF only: hidden visibility plus function symbol type */
2544+    .type \fname, %function
2545+#endif
2546+\fname:
2547+#endif
2548+.endm
2549+
2550+/* Transpose elements of single 128 bit registers */
2551+.macro transpose_single x0,x1,xi,xilen,literal  /* xi is scratch; xilen is its element suffix, literal the trn arrangement */
2552+    ins  \xi\xilen[0],  \x0\xilen[0]  /* scratch lane 0 <- x0 lane 0 */
2553+    ins  \x1\xilen[0],  \x0\xilen[1]  /* x1 lane 0 <- x0 lane 1 */
2554+    trn1 \x0\literal,   \x0\literal, \x1\literal
2555+    trn2 \x1\literal,   \xi\literal, \x1\literal  /* reads the scratch copy, since x0 was just overwritten */
2556+.endm
2557+
2558+/* Transpose elements of 2 different registers */
2559+.macro transpose x0,x1,xi,xilen,literal  /* x0 <- trn1(x0,x1), x1 <- trn2(x0,x1); xi is clobbered as scratch */
2560+    mov  \xi\xilen,     \x0\xilen  /* keep the original x0, which trn1 overwrites next */
2561+    trn1 \x0\literal,   \x0\literal, \x1\literal
2562+    trn2 \x1\literal,   \xi\literal, \x1\literal
2563+.endm
2564+
2565+/* Transpose a block of 4x4 coefficients in four 64-bit registers */
2566+.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen  /* trn1/trn2 on the pairs (x0,x2) and (x1,x3); xi is scratch */
2567+    mov  \xi\xilen, \x0\xilen  /* save x0 before trn1 overwrites it */
2568+    trn1 \x0\x0len, \x0\x0len, \x2\x2len
2569+    trn2 \x2\x2len, \xi\x0len, \x2\x2len
2570+    mov  \xi\xilen, \x1\xilen  /* save x1 before trn1 overwrites it */
2571+    trn1 \x1\x1len, \x1\x1len, \x3\x3len
2572+    trn2 \x3\x3len, \xi\x1len, \x3\x3len
2573+.endm
2574+
2575+.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen  /* trn1/trn2 on the pairs (x0,x1) and (x2,x3); xi is scratch */
2576+    mov  \xi\xilen, \x0\xilen  /* save x0 before trn1 overwrites it */
2577+    trn1 \x0\x0len, \x0\x0len, \x1\x1len
2578+    trn2 \x1\x2len, \xi\x0len, \x1\x2len  /* NOTE(review): x1 is suffixed with \x2len here, not \x1len */
2579+    mov  \xi\xilen, \x2\xilen  /* save x2 before trn1 overwrites it */
2580+    trn1 \x2\x2len, \x2\x2len, \x3\x3len
2581+    trn2 \x3\x2len, \xi\x1len, \x3\x3len  /* NOTE(review): mixed suffixes again; harmless for transpose_4x4, which passes .4h for every length */
2582+.endm
2583+
2584+.macro transpose_4x4 x0, x1, x2, x3,x5  /* runs the 16-bit step, then the 32-bit step, on x0-x3; x5 is scratch */
2585+    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
2586+    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
2587+.endm
2588+
2589+
2590+#define CENTERJSAMPLE 128
2591+
2592+/*****************************************************************************/
2593+
2594+/*
2595+ * Perform dequantization and inverse DCT on one block of coefficients.
2596+ *
2597+ * GLOBAL(void)
2598+ * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
2599+ *                        JSAMPARRAY output_buf, JDIMENSION output_col)
2600+ */
2601+
2602+#define FIX_0_298631336  (2446)  /* each FIX_x is x * 2^13 rounded to the nearest integer */
2603+#define FIX_0_390180644  (3196)
2604+#define FIX_0_541196100  (4433)
2605+#define FIX_0_765366865  (6270)
2606+#define FIX_0_899976223  (7373)
2607+#define FIX_1_175875602  (9633)
2608+#define FIX_1_501321110  (12299)
2609+#define FIX_1_847759065  (15137)
2610+#define FIX_1_961570560  (16069)
2611+#define FIX_2_053119869  (16819)
2612+#define FIX_2_562915447  (20995)
2613+#define FIX_3_072711026  (25172)
2614+
2615+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)  /* pre-combined sums and differences of the values above */
2616+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
2617+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
2618+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
2619+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
2620+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
2621+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
2622+#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
2623+
2624+/*
2625+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
2626+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
2627+ */
2628+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
2629+{ /* results are stored in tmp0-tmp3 and tmp10-tmp13, which the expansion site must declare */ \
2630+    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
2631+    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
2632+    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
2633+                                                                              \
2634+    /* 1-D iDCT input data */                                                 \
2635+    row0 = xrow0;                                                             \
2636+    row1 = xrow1;                                                             \
2637+    row2 = xrow2;                                                             \
2638+    row3 = xrow3;                                                             \
2639+    row4 = xrow4;                                                             \
2640+    row5 = xrow5;                                                             \
2641+    row6 = xrow6;                                                             \
2642+    row7 = xrow7;                                                             \
2643+                                                                              \
2644+    q5 = row7 + row3;                                                         \
2645+    q4 = row5 + row1;                                                         \
2646+    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
2647+         MULTIPLY(q4, FIX_1_175875602);                                       \
2648+    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
2649+         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
2650+    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
2651+         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
2652+    q4 = q6;                                                                  \
2653+    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
2654+    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
2655+          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
2656+    /* now we can use q1 (reloadable constants have been used up) */          \
2657+    q1 = q3 + q2;                                                             \
2658+    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
2659+          MULTIPLY(row1, -FIX_0_899976223);                                   \
2660+    q5 = q7;                                                                  \
2661+    q1 = q1 + q6;                                                             \
2662+    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
2663+          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
2664+                                                                              \
2665+    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
2666+    tmp11_plus_tmp2 = q1;                                                     \
2667+    row1 = 0;  /* row1 is not read again below */                             \
2668+                                                                              \
2669+    q1 = q1 - q6;                                                             \
2670+    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
2671+          MULTIPLY(row3, -FIX_2_562915447);                                   \
2672+    q1 = q1 - q6;                                                             \
2673+    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
2674+         MULTIPLY(row6, FIX_0_541196100);                                     \
2675+    q3 = q3 - q2;                                                             \
2676+                                                                              \
2677+    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
2678+    tmp11_minus_tmp2 = q1;                                                    \
2679+                                                                              \
2680+    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
2681+    q2 = q1 + q6;                                                             \
2682+    q1 = q1 - q6;                                                             \
2683+                                                                              \
2684+    /* pick up the results */                                                 \
2685+    tmp0  = q4;                                                               \
2686+    tmp1  = q5;                                                               \
2687+    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
2688+    tmp3  = q7;                                                               \
2689+    tmp10 = q2;                                                               \
2690+    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
2691+    tmp12 = q3;                                                               \
2692+    tmp13 = q1;                                                               \
2693+}
2694+
2695+#define XFIX_0_899976223                    v0.4h[0]  /* lane aliases in v0-v2, listed in the same order as jsimd_idct_islow_neon_consts */
2696+#define XFIX_0_541196100                    v0.4h[1]
2697+#define XFIX_2_562915447                    v0.4h[2]
2698+#define XFIX_0_298631336_MINUS_0_899976223  v0.4h[3]
2699+#define XFIX_1_501321110_MINUS_0_899976223  v1.4h[0]
2700+#define XFIX_2_053119869_MINUS_2_562915447  v1.4h[1]
2701+#define XFIX_0_541196100_PLUS_0_765366865   v1.4h[2]
2702+#define XFIX_1_175875602                    v1.4h[3]
2703+#define XFIX_1_175875602_MINUS_0_390180644  v2.4h[0]
2704+#define XFIX_0_541196100_MINUS_1_847759065  v2.4h[1]
2705+#define XFIX_3_072711026_MINUS_2_562915447  v2.4h[2]
2706+#define XFIX_1_175875602_MINUS_1_961570560  v2.4h[3]
2707+
2708+.balign 16  /* 16-byte aligned table of twelve 16-bit multipliers (24 bytes) for jsimd_idct_islow_neon */
2709+jsimd_idct_islow_neon_consts:  /* NOTE(review): the function's first ld1 {v0.4h-v3.4h} reads 32 bytes from here, 8 past the table end */
2710+    .short FIX_0_899976223                    /* v0.4h[0] */
2711+    .short FIX_0_541196100                    /* v0.4h[1] */
2712+    .short FIX_2_562915447                    /* v0.4h[2] */
2713+    .short FIX_0_298631336_MINUS_0_899976223  /* v0.4h[3] */
2714+    .short FIX_1_501321110_MINUS_0_899976223  /* v1.4h[0] */
2715+    .short FIX_2_053119869_MINUS_2_562915447  /* v1.4h[1] */
2716+    .short FIX_0_541196100_PLUS_0_765366865   /* v1.4h[2] */
2717+    .short FIX_1_175875602                    /* v1.4h[3] */
2718+    /* reloadable constants: this group starts at byte offset 16 and is re-read into v2 later */
2719+    .short FIX_1_175875602_MINUS_0_390180644  /* v2.4h[0] */
2720+    .short FIX_0_541196100_MINUS_1_847759065  /* v2.4h[1] */
2721+    .short FIX_3_072711026_MINUS_2_562915447  /* v2.4h[2] */
2722+    .short FIX_1_175875602_MINUS_1_961570560  /* v2.4h[3] */
2723+
2724+asm_function jsimd_idct_islow_neon
2725+
2726+    DCT_TABLE       .req x0
2727+    COEF_BLOCK      .req x1
2728+    OUTPUT_BUF      .req x2
2729+    OUTPUT_COL      .req x3
2730+    TMP1            .req x0
2731+    TMP2            .req x1
2732+    TMP3            .req x2
2733+    TMP4            .req x15
2734+
2735+    ROW0L           .req v16
2736+    ROW0R           .req v17
2737+    ROW1L           .req v18
2738+    ROW1R           .req v19
2739+    ROW2L           .req v20
2740+    ROW2R           .req v21
2741+    ROW3L           .req v22
2742+    ROW3R           .req v23
2743+    ROW4L           .req v24
2744+    ROW4R           .req v25
2745+    ROW5L           .req v26
2746+    ROW5R           .req v27
2747+    ROW6L           .req v28
2748+    ROW6R           .req v29
2749+    ROW7L           .req v30
2750+    ROW7R           .req v31
2751+    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
2752+    sub             sp, sp, 272
2753+    str             x15, [sp], 16
2754+    adr             x15, jsimd_idct_islow_neon_consts
2755+    st1             {v0.8b - v3.8b}, [sp], 32
2756+    st1             {v4.8b - v7.8b}, [sp], 32
2757+    st1             {v8.8b - v11.8b}, [sp], 32
2758+    st1             {v12.8b - v15.8b}, [sp], 32
2759+    st1             {v16.8b - v19.8b}, [sp], 32
2760+    st1             {v20.8b - v23.8b}, [sp], 32
2761+    st1             {v24.8b - v27.8b}, [sp], 32
2762+    st1             {v28.8b - v31.8b}, [sp], 32
2763+    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
2764+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2765+    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
2766+    mul             v16.4h, v16.4h, v0.4h
2767+    mul             v17.4h, v17.4h, v1.4h
2768+    ins             v16.2d[1], v17.2d[0]  /* 128 bit q8 */
2769+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2770+    mul             v18.4h, v18.4h, v2.4h
2771+    mul             v19.4h, v19.4h, v3.4h
2772+    ins             v18.2d[1], v19.2d[0]  /* 128 bit q9 */
2773+    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
2774+    mul             v20.4h, v20.4h, v4.4h
2775+    mul             v21.4h, v21.4h, v5.4h
2776+    ins             v20.2d[1], v21.2d[0]  /* 128 bit q10 */
2777+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2778+    mul             v22.4h, v22.4h, v6.4h
2779+    mul             v23.4h, v23.4h, v7.4h
2780+    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
2781+    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
2782+    mul             v24.4h, v24.4h, v0.4h
2783+    mul             v25.4h, v25.4h, v1.4h
2784+    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
2785+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2786+    mul             v28.4h, v28.4h, v4.4h
2787+    mul             v29.4h, v29.4h, v5.4h
2788+    ins             v28.2d[1], v29.2d[0]  /* 128 bit q14 */
2789+    mul             v26.4h, v26.4h, v2.4h
2790+    mul             v27.4h, v27.4h, v3.4h
2791+    ins             v26.2d[1], v27.2d[0]  /* 128 bit q13 */
2792+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
2793+    add             x15, x15, #16
2794+    mul             v30.4h, v30.4h, v6.4h
2795+    mul             v31.4h, v31.4h, v7.4h
2796+    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
2797+    /* Go to the bottom of the stack */
2798+    sub             sp, sp, 352
2799+    stp             x4, x5, [sp], 16
2800+    st1             {v8.4h - v11.4h}, [sp], 32  /* save NEON registers */
2801+    st1             {v12.4h - v15.4h}, [sp], 32
2802+    /* 1-D IDCT, pass 1, left 4x8 half */
2803+    add             v4.4h,    ROW7L.4h, ROW3L.4h
2804+    add             v5.4h,    ROW5L.4h, ROW1L.4h
2805+    smull           v12.4s,   v4.4h,    XFIX_1_175875602_MINUS_1_961570560
2806+    smlal           v12.4s,   v5.4h,    XFIX_1_175875602
2807+    smull           v14.4s,   v4.4h,    XFIX_1_175875602
2808+    /* Check for the zero coefficients in the right 4x8 half */
2809+    smlal           v14.4s,   v5.4h,    XFIX_1_175875602_MINUS_0_390180644
2810+    ssubl           v6.4s,    ROW0L.4h, ROW4L.4h
2811+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
2812+    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
2813+    smlal           v4.4s,    ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
2814+      orr           x0,       x4,       x5
2815+    mov             v8.16b,   v12.16b
2816+    smlsl           v12.4s,   ROW5L.4h, XFIX_2_562915447
2817+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
2818+    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
2819+    shl             v6.4s,    v6.4s,    #13
2820+      orr           x0,       x0,       x4
2821+    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
2822+      orr           x0,       x0 ,      x5
2823+    add             v2.4s,    v6.4s,    v4.4s
2824+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
2825+    mov             v10.16b,  v14.16b
2826+    add             v2.4s,    v2.4s,    v12.4s
2827+      orr           x0,       x0,       x4
2828+    smlsl           v14.4s,   ROW7L.4h, XFIX_0_899976223
2829+      orr           x0,       x0,       x5
2830+    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
2831+    rshrn           ROW1L.4h, v2.4s,    #11
2832+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
2833+    sub             v2.4s,    v2.4s,    v12.4s
2834+    smlal           v10.4s,   ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
2835+      orr           x0,       x0,       x4
2836+    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
2837+      orr           x0,       x0,       x5
2838+    sub             v2.4s,    v2.4s,    v12.4s
2839+    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
2840+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
2841+    smlal           v12.4s,   ROW6L.4h, XFIX_0_541196100
2842+    sub             v6.4s,    v6.4s,    v4.4s
2843+      orr           x0,       x0,       x4
2844+    rshrn           ROW6L.4h, v2.4s,    #11
2845+      orr           x0,       x0,       x5
2846+    add             v2.4s,    v6.4s,    v10.4s
2847+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
2848+    sub             v6.4s,    v6.4s,    v10.4s
2849+    saddl           v10.4s,   ROW0L.4h, ROW4L.4h
2850+      orr           x0,       x0,       x4
2851+    rshrn           ROW2L.4h, v2.4s,    #11
2852+      orr           x0,       x0,       x5
2853+    rshrn           ROW5L.4h, v6.4s,    #11
2854+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
2855+    shl             v10.4s,   v10.4s,   #13
2856+    smlal           v8.4s,    ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
2857+      orr           x0,       x0,       x4
2858+    add             v4.4s,    v10.4s,   v12.4s
2859+      orr           x0,       x0,       x5
2860+    cmp             x0, #0 /* orrs instruction removed */
2861+    sub             v2.4s,    v10.4s,   v12.4s
2862+    add             v12.4s,   v4.4s,    v14.4s
2863+      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
2864+    sub             v4.4s,    v4.4s,    v14.4s
2865+    add             v10.4s,   v2.4s,    v8.4s
2866+      orr           x0,       x4,       x5
2867+    sub             v6.4s,    v2.4s,    v8.4s
2868+      /* pop             {x4, x5} */
2869+      sub           sp, sp, 80
2870+      ldp           x4, x5, [sp], 16
2871+    rshrn           ROW7L.4h, v4.4s,    #11
2872+    rshrn           ROW3L.4h, v10.4s,   #11
2873+    rshrn           ROW0L.4h, v12.4s,   #11
2874+    rshrn           ROW4L.4h, v6.4s,    #11
2875+
2876+      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
2877+
2878+    /* 1-D IDCT, pass 1, right 4x8 half */
2879+    ld1             {v2.4h},  [x15]    /* reload constants */
2880+    add             v10.4h,   ROW7R.4h, ROW3R.4h
2881+    add             v8.4h,    ROW5R.4h, ROW1R.4h
2882+    /* Transpose ROW6L <-> ROW7L   (v3 available free register) */
2883+    transpose       ROW6L, ROW7L, v3, .16b, .4h
2884+    smull           v12.4s,   v10.4h,   XFIX_1_175875602_MINUS_1_961570560
2885+    smlal           v12.4s,   v8.4h,    XFIX_1_175875602
2886+    /* Transpose ROW2L <-> ROW3L   (v3 available free register) */
2887+    transpose       ROW2L, ROW3L, v3, .16b, .4h
2888+    smull           v14.4s,   v10.4h,   XFIX_1_175875602
2889+    smlal           v14.4s,   v8.4h,    XFIX_1_175875602_MINUS_0_390180644
2890+    /* Transpose ROW0L <-> ROW1L   (v3 available free register) */
2891+    transpose       ROW0L, ROW1L, v3, .16b, .4h
2892+    ssubl           v6.4s,    ROW0R.4h, ROW4R.4h
2893+    smull           v4.4s,    ROW2R.4h, XFIX_0_541196100
2894+    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
2895+    /* Transpose ROW4L <-> ROW5L   (v3 available free register) */
2896+    transpose       ROW4L, ROW5L, v3, .16b, .4h
2897+    mov             v8.16b,   v12.16b
2898+    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
2899+    smlal           v12.4s,   ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
2900+    /* Transpose ROW1L <-> ROW3L   (v3 available free register) */
2901+    transpose       ROW1L, ROW3L, v3, .16b, .2s
2902+    shl             v6.4s,    v6.4s,    #13
2903+    smlsl           v8.4s,    ROW1R.4h, XFIX_0_899976223
2904+    /* Transpose ROW4L <-> ROW6L   (v3 available free register) */
2905+    transpose       ROW4L, ROW6L, v3, .16b, .2s
2906+    add             v2.4s,    v6.4s,    v4.4s
2907+    mov             v10.16b,  v14.16b
2908+    add             v2.4s,    v2.4s,    v12.4s
2909+    /* Transpose ROW0L <-> ROW2L   (v3 available free register) */
2910+    transpose       ROW0L, ROW2L, v3, .16b, .2s
2911+    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
2912+    smlal           v14.4s,   ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
2913+    rshrn           ROW1R.4h, v2.4s,    #11
2914+    /* Transpose ROW5L <-> ROW7L   (v3 available free register) */
2915+    transpose       ROW5L, ROW7L, v3, .16b, .2s
2916+    sub             v2.4s,    v2.4s,    v12.4s
2917+    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
2918+    smlsl           v10.4s,   ROW3R.4h, XFIX_2_562915447
2919+    sub             v2.4s,    v2.4s,    v12.4s
2920+    smull           v12.4s,   ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
2921+    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
2922+    sub             v6.4s,    v6.4s,    v4.4s
2923+    rshrn           ROW6R.4h, v2.4s,    #11
2924+    add             v2.4s,    v6.4s,    v10.4s
2925+    sub             v6.4s,    v6.4s,    v10.4s
2926+    saddl           v10.4s,   ROW0R.4h, ROW4R.4h
2927+    rshrn           ROW2R.4h, v2.4s,    #11
2928+    rshrn           ROW5R.4h, v6.4s,    #11
2929+    shl             v10.4s,   v10.4s,   #13
2930+    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
2931+    add             v4.4s,    v10.4s,   v12.4s
2932+    sub             v2.4s,    v10.4s,   v12.4s
2933+    add             v12.4s,   v4.4s,    v14.4s
2934+    sub             v4.4s,    v4.4s,    v14.4s
2935+    add             v10.4s,   v2.4s,    v8.4s
2936+    sub             v6.4s,    v2.4s,    v8.4s
2937+    rshrn           ROW7R.4h, v4.4s,    #11
2938+    rshrn           ROW3R.4h, v10.4s,   #11
2939+    rshrn           ROW0R.4h, v12.4s,   #11
2940+    rshrn           ROW4R.4h, v6.4s,    #11
2941+    /* Transpose right 4x8 half */
2942+    transpose       ROW6R, ROW7R, v3, .16b, .4h
2943+    transpose       ROW2R, ROW3R, v3, .16b, .4h
2944+    transpose       ROW0R, ROW1R, v3, .16b, .4h
2945+    transpose       ROW4R, ROW5R, v3, .16b, .4h
2946+    transpose       ROW1R, ROW3R, v3, .16b, .2s
2947+    transpose       ROW4R, ROW6R, v3, .16b, .2s
2948+    transpose       ROW0R, ROW2R, v3, .16b, .2s
2949+    transpose       ROW5R, ROW7R, v3, .16b, .2s
2950+
2951+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
2952+    ld1             {v2.4h},  [x15]    /* reload constants */
2953+    smull           v12.4S,   ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
2954+    smlal           v12.4s,   ROW1L.4h, XFIX_1_175875602
2955+    smlal           v12.4s,   ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
2956+    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
2957+    smull           v14.4s,   ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
2958+    smlal           v14.4s,   ROW3L.4h, XFIX_1_175875602
2959+    smlal           v14.4s,   ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
2960+    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
2961+    ssubl           v6.4s,    ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
2962+    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
2963+    smlal           v4.4s,    ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
2964+    mov             v8.16b,   v12.16b
2965+    smlsl           v12.4s,   ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
2966+    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
2967+    shl             v6.4s,    v6.4s,    #13
2968+    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
2969+    add             v2.4s,    v6.4s,    v4.4s
2970+    mov             v10.16b,  v14.16b
2971+    add             v2.4s,    v2.4s,    v12.4s
2972+    smlsl           v14.4s,   ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
2973+    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
2974+    shrn            ROW1L.4h, v2.4s,    #16
2975+    sub             v2.4s,    v2.4s,    v12.4s
2976+    smlal           v10.4s,   ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
2977+    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
2978+    sub             v2.4s,    v2.4s,    v12.4s
2979+    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
2980+    smlal           v12.4s,   ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
2981+    sub             v6.4s,    v6.4s,    v4.4s
2982+    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
2983+    add             v2.4s,    v6.4s,    v10.4s
2984+    sub             v6.4s,    v6.4s,    v10.4s
2985+    saddl           v10.4s,   ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
2986+    shrn            ROW2L.4h, v2.4s,    #16
2987+    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
2988+    shl             v10.4s,   v10.4s,   #13
2989+    smlal           v8.4s,    ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
2990+    add             v4.4s,    v10.4s,   v12.4s
2991+    sub             v2.4s,    v10.4s,   v12.4s
2992+    add             v12.4s,   v4.4s,    v14.4s
2993+    sub             v4.4s,    v4.4s,    v14.4s
2994+    add             v10.4s,   v2.4s,    v8.4s
2995+    sub             v6.4s,    v2.4s,    v8.4s
2996+    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
2997+    shrn            ROW3L.4h, v10.4s,   #16
2998+    shrn            ROW0L.4h, v12.4s,   #16
2999+    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
3000+    /* 1-D IDCT, pass 2, right 4x8 half */
3001+    ld1             {v2.4h},  [x15]    /* reload constants */
3002+    smull           v12.4s,   ROW5R.4h, XFIX_1_175875602
3003+    smlal           v12.4s,   ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
3004+    smlal           v12.4s,   ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
3005+    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
3006+    smull           v14.4s,   ROW7R.4h, XFIX_1_175875602
3007+    smlal           v14.4s,   ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
3008+    smlal           v14.4s,   ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
3009+    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
3010+    ssubl           v6.4s,    ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3011+    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
3012+    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
3013+    mov             v8.16b,   v12.16b
3014+    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
3015+    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
3016+    shl             v6.4s,    v6.4s,    #13
3017+    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
3018+    add             v2.4s,    v6.4s,    v4.4s
3019+    mov             v10.16b,  v14.16b
3020+    add             v2.4s,    v2.4s,    v12.4s
3021+    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
3022+    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
3023+    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
3024+    sub             v2.4s,    v2.4s,    v12.4s
3025+    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
3026+    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
3027+    sub             v2.4s,    v2.4s,    v12.4s
3028+    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
3029+    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
3030+    sub             v6.4s,    v6.4s,    v4.4s
3031+    shrn            ROW6R.4h, v2.4s,    #16
3032+    add             v2.4s,    v6.4s,    v10.4s
3033+    sub             v6.4s,    v6.4s,    v10.4s
3034+    saddl           v10.4s,   ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3035+    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
3036+    shrn            ROW5R.4h, v6.4s,    #16
3037+    shl             v10.4s,   v10.4s,   #13
3038+    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
3039+    add             v4.4s,    v10.4s,   v12.4s
3040+    sub             v2.4s,    v10.4s,   v12.4s
3041+    add             v12.4s,   v4.4s,    v14.4s
3042+    sub             v4.4s,    v4.4s,    v14.4s
3043+    add             v10.4s,   v2.4s,    v8.4s
3044+    sub             v6.4s,    v2.4s,    v8.4s
3045+    shrn            ROW7R.4h, v4.4s,    #16
3046+    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
3047+    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
3048+    shrn            ROW4R.4h, v6.4s,    #16
3049+
3050+2:  /* Descale to 8-bit and range limit */
3051+    ins             v16.2d[1], v17.2d[0]
3052+    ins             v18.2d[1], v19.2d[0]
3053+    ins             v20.2d[1], v21.2d[0]
3054+    ins             v22.2d[1], v23.2d[0]
3055+    sqrshrn         v16.8b,   v16.8h,   #2
3056+    sqrshrn2        v16.16b,  v18.8h,   #2
3057+    sqrshrn         v18.8b,   v20.8h,   #2
3058+    sqrshrn2        v18.16b,  v22.8h,   #2
3059+
3060+    /* vpop            {v8.4h - d15.4h} */ /* restore NEON registers */
3061+    ld1             {v8.4h - v11.4h}, [sp], 32
3062+    ld1             {v12.4h - v15.4h}, [sp], 32
3063+    ins             v24.2d[1], v25.2d[0]
3064+
3065+    sqrshrn         v20.8b,   v24.8h,   #2
3066+      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
3067+    /* trn1            v16.8h,    v16.8h,  v18.8h */
3068+    transpose       v16, v18, v3, .16b, .8h
3069+    ins             v26.2d[1], v27.2d[0]
3070+    ins             v28.2d[1], v29.2d[0]
3071+    ins             v30.2d[1], v31.2d[0]
3072+    sqrshrn2        v20.16b,  v26.8h,   #2
3073+    sqrshrn         v22.8b,   v28.8h,   #2
3074+    movi            v0.16b,   #(CENTERJSAMPLE)
3075+    sqrshrn2        v22.16b,  v30.8h,   #2
3076+    transpose_single v16, v17, v3, .2d, .8b
3077+    transpose_single v18, v19, v3, .2d, .8b
3078+    add             v16.8b,   v16.8b,   v0.8b
3079+    add             v17.8b,   v17.8b,   v0.8b
3080+    add             v18.8b,   v18.8b,   v0.8b
3081+    add             v19.8b,   v19.8b,   v0.8b
3082+    transpose       v20, v22, v3, .16b, .8h
3083+    /* Store results to the output buffer */
3084+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
3085+    add             TMP1,     TMP1,     OUTPUT_COL
3086+    add             TMP2,     TMP2,     OUTPUT_COL
3087+    st1             {v16.8b}, [TMP1]
3088+    transpose_single v20, v21, v3, .2d, .8b
3089+    st1             {v17.8b}, [TMP2]
3090+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
3091+    add             TMP1,     TMP1,     OUTPUT_COL
3092+    add             TMP2,     TMP2,     OUTPUT_COL
3093+    st1             {v18.8b}, [TMP1]
3094+    add             v20.8b,   v20.8b,   v0.8b
3095+    add             v21.8b,   v21.8b,   v0.8b
3096+    st1             {v19.8b}, [TMP2]
3097+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
3098+    ldp             TMP3,     TMP4,     [OUTPUT_BUF]
3099+    add             TMP1,     TMP1,     OUTPUT_COL
3100+    add             TMP2,     TMP2,     OUTPUT_COL
3101+    add             TMP3,     TMP3,     OUTPUT_COL
3102+    add             TMP4,     TMP4,     OUTPUT_COL
3103+    transpose_single v22, v23, v3, .2d, .8b
3104+    st1             {v20.8b}, [TMP1]
3105+    add             v22.8b,   v22.8b,   v0.8b
3106+    add             v23.8b,   v23.8b,   v0.8b
3107+    st1             {v21.8b}, [TMP2]
3108+    st1             {v22.8b}, [TMP3]
3109+    st1             {v23.8b}, [TMP4]
3110+    ldr             x15, [sp], 16
3111+    ld1             {v0.8b - v3.8b}, [sp], 32
3112+    ld1             {v4.8b - v7.8b}, [sp], 32
3113+    ld1             {v8.8b - v11.8b}, [sp], 32
3114+    ld1             {v12.8b - v15.8b}, [sp], 32
3115+    ld1             {v16.8b - v19.8b}, [sp], 32
3116+    ld1             {v20.8b - v23.8b}, [sp], 32
3117+    ld1             {v24.8b - v27.8b}, [sp], 32
3118+    ld1             {v28.8b - v31.8b}, [sp], 32
3119+    blr             x30
3120+
3121+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
3122+
3123+    /* Transpose left 4x8 half */
3124+    transpose       ROW6L, ROW7L, v3, .16b, .4h
3125+    transpose       ROW2L, ROW3L, v3, .16b, .4h
3126+    transpose       ROW0L, ROW1L, v3, .16b, .4h
3127+    transpose       ROW4L, ROW5L, v3, .16b, .4h
3128+    shl             ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
3129+    transpose       ROW1L, ROW3L, v3, .16b, .2s
3130+    transpose       ROW4L, ROW6L, v3, .16b, .2s
3131+    transpose       ROW0L, ROW2L, v3, .16b, .2s
3132+    transpose       ROW5L, ROW7L, v3, .16b, .2s
3133+    cmp             x0, #0
3134+    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
3135+
3136+    /* Only row 0 is non-zero for the right 4x8 half  */
3137+    dup             ROW1R.4h, ROW0R.4h[1]
3138+    dup             ROW2R.4h, ROW0R.4h[2]
3139+    dup             ROW3R.4h, ROW0R.4h[3]
3140+    dup             ROW4R.4h, ROW0R.4h[0]
3141+    dup             ROW5R.4h, ROW0R.4h[1]
3142+    dup             ROW6R.4h, ROW0R.4h[2]
3143+    dup             ROW7R.4h, ROW0R.4h[3]
3144+    dup             ROW0R.4h, ROW0R.4h[0]
3145+    b               1b /* Go to 'normal' second pass */
3146+
3147+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
3148+    ld1             {v2.4h},  [x15]    /* reload constants */
3149+    smull           v12.4s,   ROW1L.4h, XFIX_1_175875602
3150+    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
3151+    smull           v14.4s,   ROW3L.4h, XFIX_1_175875602
3152+    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
3153+    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
3154+    sshll           v6.4s,    ROW0L.4h, #13
3155+    mov             v8.16b,   v12.16b
3156+    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
3157+    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
3158+    add             v2.4s,    v6.4s,    v4.4s
3159+    mov             v10.16b,  v14.16b
3160+    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
3161+    add             v2.4s,    v2.4s,    v12.4s
3162+    add             v12.4s,   v12.4s,   v12.4s
3163+    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
3164+    shrn            ROW1L.4h, v2.4s,    #16
3165+    sub             v2.4s,    v2.4s,    v12.4s
3166+    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
3167+    sub             v6.4s,    v6.4s,    v4.4s
3168+    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
3169+    add             v2.4s,    v6.4s,    v10.4s
3170+    sub             v6.4s,    v6.4s,    v10.4s
3171+    sshll           v10.4s,   ROW0L.4h, #13
3172+    shrn            ROW2L.4h, v2.4s,    #16
3173+    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
3174+    add             v4.4s,    v10.4s,   v12.4s
3175+    sub             v2.4s,    v10.4s,   v12.4s
3176+    add             v12.4s,   v4.4s,    v14.4s
3177+    sub             v4.4s,    v4.4s,    v14.4s
3178+    add             v10.4s,   v2.4s,    v8.4s
3179+    sub             v6.4s,    v2.4s,    v8.4s
3180+    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
3181+    shrn            ROW3L.4h, v10.4s,   #16
3182+    shrn            ROW0L.4h, v12.4s,   #16
3183+    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
3184+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
3185+    ld1             {v2.4h},  [x15]    /* reload constants */
3186+    smull           v12.4s,   ROW5L.4h, XFIX_1_175875602
3187+    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
3188+    smull           v14.4s,   ROW7L.4h, XFIX_1_175875602
3189+    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
3190+    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100
3191+    sshll           v6.4s,    ROW4L.4h, #13
3192+    mov             v8.16b,   v12.16b
3193+    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
3194+    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223
3195+    add             v2.4s,    v6.4s,    v4.4s
3196+    mov             v10.16b,  v14.16b
3197+    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
3198+    add             v2.4s,    v2.4s,    v12.4s
3199+    add             v12.4s,   v12.4s,   v12.4s
3200+    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447
3201+    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
3202+    sub             v2.4s,    v2.4s,    v12.4s
3203+    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
3204+    sub             v6.4s,    v6.4s,    v4.4s
3205+    shrn            ROW6R.4h, v2.4s,    #16
3206+    add             v2.4s,    v6.4s,    v10.4s
3207+    sub             v6.4s,    v6.4s,    v10.4s
3208+    sshll           v10.4s,   ROW4L.4h, #13
3209+    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
3210+    shrn            ROW5R.4h, v6.4s,    #16
3211+    add             v4.4s,    v10.4s,   v12.4s
3212+    sub             v2.4s,    v10.4s,   v12.4s
3213+    add             v12.4s,   v4.4s,    v14.4s
3214+    sub             v4.4s,    v4.4s,    v14.4s
3215+    add             v10.4s,   v2.4s,    v8.4s
3216+    sub             v6.4s,    v2.4s,    v8.4s
3217+    shrn            ROW7R.4h, v4.4s,    #16
3218+    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
3219+    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
3220+    shrn            ROW4R.4h, v6.4s,    #16
3221+    b               2b /* Go to epilogue */
3222+
3223+    .unreq          DCT_TABLE
3224+    .unreq          COEF_BLOCK
3225+    .unreq          OUTPUT_BUF
3226+    .unreq          OUTPUT_COL
3227+    .unreq          TMP1
3228+    .unreq          TMP2
3229+    .unreq          TMP3
3230+    .unreq          TMP4
3231+
3232+    .unreq          ROW0L
3233+    .unreq          ROW0R
3234+    .unreq          ROW1L
3235+    .unreq          ROW1R
3236+    .unreq          ROW2L
3237+    .unreq          ROW2R
3238+    .unreq          ROW3L
3239+    .unreq          ROW3R
3240+    .unreq          ROW4L
3241+    .unreq          ROW4R
3242+    .unreq          ROW5L
3243+    .unreq          ROW5R
3244+    .unreq          ROW6L
3245+    .unreq          ROW6R
3246+    .unreq          ROW7L
3247+    .unreq          ROW7R
3248+
3249+
3250+/*****************************************************************************/
3251+
3252+/*
3253+ * jsimd_idct_ifast_neon
3254+ *
3255+ * This function contains a fast, not so accurate integer implementation of
3256+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
3257+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
3258+ * function from jidctfst.c.
3259+ *
3260+ * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions. In the ARM
3261+ * NEON case some extra additions are required, because the VQDMULH instruction
3262+ * (SQDMULH on AArch64) can't handle constants larger than 1. So expressions
3263+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", which
3264+ * introduces an extra addition. Overall, there are 6 extra additions per 1-D
3265+ * IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions.
3266+ */
3267+
3268+#define XFIX_1_082392200 v0.4h[0] /* 16-bit lane 0 of v0: 1st table entry */
3269+#define XFIX_1_414213562 v0.4h[1] /* 16-bit lane 1 of v0: 2nd table entry */
3270+#define XFIX_1_847759065 v0.4h[2] /* 16-bit lane 2 of v0: 3rd table entry */
3271+#define XFIX_2_613125930 v0.4h[3] /* 16-bit lane 3 of v0: 4th table entry */
3272+
3273+.balign 16 /* start the constant table on a 16-byte boundary */
3274+jsimd_idct_ifast_neon_consts: /* loaded into v0.4h by jsimd_idct_ifast_neon */
3275+    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: (277/256 - 1) * 2^15, integer part 1 added back by the code */
3276+    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: (362/256 - 1) * 2^15, integer part 1 added back by the code */
3277+    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: (473/256 - 1) * 2^15, integer part 1 added back by the code */
3278+    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: (669/256 - 2) * 2^15, integer part 2 added back by the code */
3279+
3280+asm_function jsimd_idct_ifast_neon
3281+
3282+    DCT_TABLE       .req x0
3283+    COEF_BLOCK      .req x1
3284+    OUTPUT_BUF      .req x2
3285+    OUTPUT_COL      .req x3
3286+    TMP1            .req x0
3287+    TMP2            .req x1
3288+    TMP3            .req x2
3289+    TMP4            .req x22
3290+    TMP5            .req x23
3291+
3292+    /* Load and dequantize coefficients into NEON registers
3293+     * with the following allocation:
3294+     *       0 1 2 3 | 4 5 6 7
3295+     *      ---------+--------
3296+     *   0 | d16     | d17     ( v8.8h  )
3297+     *   1 | d18     | d19     ( v9.8h  )
3298+     *   2 | d20     | d21     ( v10.8h )
3299+     *   3 | d22     | d23     ( v11.8h )
3300+     *   4 | d24     | d25     ( v12.8h )
3301+     *   5 | d26     | d27     ( v13.8h )
3302+     *   6 | d28     | d29     ( v14.8h )
3303+     *   7 | d30     | d31     ( v15.8h )
3304+     */
3305+    /* Save NEON registers used in fast IDCT */
3306+    sub             sp, sp, #176
3307+    stp             x22, x23, [sp], 16
3308+    adr             x23, jsimd_idct_ifast_neon_consts
3309+    st1             {v0.8b - v3.8b}, [sp], 32
3310+    st1             {v4.8b - v7.8b}, [sp], 32
3311+    st1             {v8.8b - v11.8b}, [sp], 32
3312+    st1             {v12.8b - v15.8b}, [sp], 32
3313+    st1             {v16.8b - v19.8b}, [sp], 32
3314+    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
3315+    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
3316+    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
3317+    mul             v8.8h,  v8.8h,  v0.8h
3318+    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
3319+    mul             v9.8h,  v9.8h,  v1.8h
3320+    ld1             {v12.8h, v13.8h}, [COEF_BLOCK], 32
3321+    mul             v10.8h, v10.8h, v2.8h
3322+    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
3323+    mul             v11.8h, v11.8h, v3.8h
3324+    ld1             {v14.8h, v15.8h}, [COEF_BLOCK], 32
3325+    mul             v12.8h, v12.8h, v0.8h
3326+    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
3327+    mul             v14.8h, v14.8h, v2.8h
3328+    mul             v13.8h, v13.8h, v1.8h
3329+    ld1             {v0.4h}, [x23]      /* load constants */
3330+    mul             v15.8h, v15.8h, v3.8h
3331+
3332+    /* 1-D IDCT, pass 1 */
3333+    sub             v2.8h,    v10.8h,   v14.8h
3334+    add             v14.8h,   v10.8h,   v14.8h
3335+    sub             v1.8h,    v11.8h,   v13.8h
3336+    add             v13.8h,   v11.8h,   v13.8h
3337+    sub             v5.8h,    v9.8h,    v15.8h
3338+    add             v15.8h,   v9.8h,    v15.8h
3339+    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
3340+    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
3341+    add             v3.8h,    v1.8h,    v1.8h
3342+    sub             v1.8h,    v5.8h,    v1.8h
3343+    add             v10.8h,   v2.8h,    v4.8h
3344+    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
3345+    sub             v2.8h,    v15.8h,   v13.8h
3346+    add             v3.8h,    v3.8h,    v6.8h
3347+    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
3348+    add             v1.8h,    v1.8h,    v4.8h
3349+    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
3350+    sub             v10.8h,   v10.8h,   v14.8h
3351+    add             v2.8h,    v2.8h,    v6.8h
3352+    sub             v6.8h,    v8.8h,    v12.8h
3353+    add             v12.8h,   v8.8h,    v12.8h
3354+    add             v9.8h,    v5.8h,    v4.8h
3355+    add             v5.8h,    v6.8h,    v10.8h
3356+    sub             v10.8h,   v6.8h,    v10.8h
3357+    add             v6.8h,    v15.8h,   v13.8h
3358+    add             v8.8h,    v12.8h,   v14.8h
3359+    sub             v3.8h,    v6.8h,    v3.8h
3360+    sub             v12.8h,   v12.8h,   v14.8h
3361+    sub             v3.8h,    v3.8h,    v1.8h
3362+    sub             v1.8h,    v9.8h,    v1.8h
3363+    add             v2.8h,    v3.8h,    v2.8h
3364+    sub             v15.8h,   v8.8h,    v6.8h
3365+    add             v1.8h,    v1.8h,    v2.8h
3366+    add             v8.8h,    v8.8h,    v6.8h
3367+    add             v14.8h,   v5.8h,    v3.8h
3368+    sub             v9.8h,    v5.8h,    v3.8h
3369+    sub             v13.8h,   v10.8h,   v2.8h
3370+    add             v10.8h,   v10.8h,   v2.8h
3371+    /* Transpose  q8-q9 */
3372+    mov             v18.16b,  v8.16b
3373+    trn1            v8.8h,    v8.8h,    v9.8h
3374+    trn2            v9.8h,    v18.8h,   v9.8h
3375+    sub             v11.8h,   v12.8h,   v1.8h
3376+    /* Transpose  q14-q15 */
3377+    mov             v18.16b,  v14.16b
3378+    trn1            v14.8h,   v14.8h,   v15.8h
3379+    trn2            v15.8h,   v18.8h,   v15.8h
3380+    add             v12.8h,   v12.8h,   v1.8h
3381+    /* Transpose  q10-q11 */
3382+    mov             v18.16b,  v10.16b
3383+    trn1            v10.8h,   v10.8h,   v11.8h
3384+    trn2            v11.8h,   v18.8h,   v11.8h
3385+    /* Transpose  q12-q13 */
3386+    mov             v18.16b,  v12.16b
3387+    trn1            v12.8h,   v12.8h,   v13.8h
3388+    trn2            v13.8h,   v18.8h,   v13.8h
3389+    /* Transpose  q9-q11 */
3390+    mov             v18.16b,  v9.16b
3391+    trn1            v9.4s,    v9.4s,    v11.4s
3392+    trn2            v11.4s,   v18.4s,   v11.4s
3393+    /* Transpose  q12-q14 */
3394+    mov             v18.16b,  v12.16b
3395+    trn1            v12.4s,   v12.4s,   v14.4s
3396+    trn2            v14.4s,   v18.4s,   v14.4s
3397+    /* Transpose  q8-q10 */
3398+    mov             v18.16b,  v8.16b
3399+    trn1            v8.4s,    v8.4s,    v10.4s
3400+    trn2            v10.4s,   v18.4s,   v10.4s
3401+    /* Transpose  q13-q15 */
3402+    mov             v18.16b,  v13.16b
3403+    trn1            v13.4s,   v13.4s,   v15.4s
3404+    trn2            v15.4s,   v18.4s,   v15.4s
3405+    /* vswp            v14.4h,   v10-MSB.4h */
3406+    umov            x22, v14.d[0]
3407+    ins             v14.2d[0], v10.2d[1]
3408+    ins             v10.2d[1], x22
3409+    /* vswp            v13.4h,   v9MSB.4h */
3410+
3411+    umov            x22, v13.d[0]
3412+    ins             v13.2d[0], v9.2d[1]
3413+    ins             v9.2d[1], x22
3414+    /* 1-D IDCT, pass 2 */
3415+    sub             v2.8h,    v10.8h,   v14.8h
3416+    /* vswp            v15.4h,   v11MSB.4h */
3417+    umov            x22, v15.d[0]
3418+    ins             v15.2d[0], v11.2d[1]
3419+    ins             v11.2d[1], x22
3420+    add             v14.8h,   v10.8h,   v14.8h
3421+    /* vswp            v12.4h,   v8-MSB.4h */
3422+    umov            x22, v12.d[0]
3423+    ins             v12.2d[0], v8.2d[1]
3424+    ins             v8.2d[1], x22
3425+    sub             v1.8h,    v11.8h,   v13.8h
3426+    add             v13.8h,   v11.8h,   v13.8h
3427+    sub             v5.8h,    v9.8h,    v15.8h
3428+    add             v15.8h,   v9.8h,    v15.8h
3429+    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
3430+    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
3431+    add             v3.8h,    v1.8h,    v1.8h
3432+    sub             v1.8h,    v5.8h,    v1.8h
3433+    add             v10.8h,   v2.8h,    v4.8h
3434+    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
3435+    sub             v2.8h,    v15.8h,   v13.8h
3436+    add             v3.8h,    v3.8h,    v6.8h
3437+    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
3438+    add             v1.8h,    v1.8h,    v4.8h
3439+    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
3440+    sub             v10.8h,   v10.8h,   v14.8h
3441+    add             v2.8h,    v2.8h,    v6.8h
3442+    sub             v6.8h,    v8.8h,    v12.8h
3443+    add             v12.8h,   v8.8h,    v12.8h
3444+    add             v9.8h,    v5.8h,    v4.8h
3445+    add             v5.8h,    v6.8h,    v10.8h
3446+    sub             v10.8h,   v6.8h,    v10.8h
3447+    add             v6.8h,    v15.8h,   v13.8h
3448+    add             v8.8h,    v12.8h,   v14.8h
3449+    sub             v3.8h,    v6.8h,    v3.8h
3450+    sub             v12.8h,   v12.8h,   v14.8h
3451+    sub             v3.8h,    v3.8h,    v1.8h
3452+    sub             v1.8h,    v9.8h,    v1.8h
3453+    add             v2.8h,    v3.8h,    v2.8h
3454+    sub             v15.8h,   v8.8h,    v6.8h
3455+    add             v1.8h,    v1.8h,    v2.8h
3456+    add             v8.8h,    v8.8h,    v6.8h
3457+    add             v14.8h,   v5.8h,    v3.8h
3458+    sub             v9.8h,    v5.8h,    v3.8h
3459+    sub             v13.8h,   v10.8h,   v2.8h
3460+    add             v10.8h,   v10.8h,   v2.8h
3461+    sub             v11.8h,   v12.8h,   v1.8h
3462+    add             v12.8h,   v12.8h,   v1.8h
3463+    /* Descale to 8-bit and range limit */
3464+    movi            v0.16b,   #0x80
3465+    sqshrn          v8.8b,    v8.8h,    #5
3466+    sqshrn2         v8.16b,   v9.8h,    #5
3467+    sqshrn          v9.8b,    v10.8h,   #5
3468+    sqshrn2         v9.16b,   v11.8h,   #5
3469+    sqshrn          v10.8b,   v12.8h,   #5
3470+    sqshrn2         v10.16b,  v13.8h,   #5
3471+    sqshrn          v11.8b,   v14.8h,   #5
3472+    sqshrn2         v11.16b,  v15.8h,   #5
3473+    add             v8.16b,   v8.16b,   v0.16b
3474+    add             v9.16b,   v9.16b,   v0.16b
3475+    add             v10.16b,  v10.16b,  v0.16b
3476+    add             v11.16b,  v11.16b,  v0.16b
3477+    /* Transpose the final 8-bit samples */
3478+    /* Transpose  q8-q9 */
3479+    mov             v18.16b,  v8.16b
3480+    trn1            v8.8h,    v8.8h,    v9.8h
3481+    trn2            v9.8h,    v18.8h,   v9.8h
3482+    /* Transpose  q10-q11 */
3483+    mov             v18.16b,  v10.16b
3484+    trn1            v10.8h,   v10.8h,   v11.8h
3485+    trn2            v11.8h,   v18.8h,   v11.8h
3486+    /* Transpose  q8-q10 */
3487+    mov             v18.16b,  v8.16b
3488+    trn1            v8.4s,    v8.4s,    v10.4s
3489+    trn2            v10.4s,   v18.4s,   v10.4s
3490+    /* Transpose  q9-q11 */
3491+    mov             v18.16b,  v9.16b
3492+    trn1            v9.4s,    v9.4s,    v11.4s
3493+    trn2            v11.4s,   v18.4s,   v11.4s
3494+    /* make copy */
3495+    ins             v17.2d[0], v8.2d[1]
3496+    /* Transpose  d16-d17-msb */
3497+    mov             v18.16b,  v8.16b
3498+    trn1            v8.8b,    v8.8b,    v17.8b
3499+    trn2            v17.8b,   v18.8b,   v17.8b
3500+    /* make copy */
3501+    ins             v19.2d[0], v9.2d[1]
3502+    mov             v18.16b,  v9.16b
3503+    trn1            v9.8b,    v9.8b,    v19.8b
3504+    trn2            v19.8b,   v18.8b,   v19.8b
3505+    /* Store results to the output buffer */
3506+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
3507+    add             TMP1,     TMP1,     OUTPUT_COL
3508+    add             TMP2,     TMP2,     OUTPUT_COL
3509+    st1             {v8.8b},  [TMP1]
3510+    st1             {v17.8b}, [TMP2]
3511+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
3512+    add             TMP1,     TMP1,     OUTPUT_COL
3513+    add             TMP2,     TMP2,     OUTPUT_COL
3514+    st1             {v9.8b},  [TMP1]
3515+    /* make copy */
3516+    ins             v7.2d[0], v10.2d[1]
3517+    mov             v18.16b,  v10.16b
3518+    trn1            v10.8b,   v10.8b,   v7.8b
3519+    trn2            v7.8b,    v18.8b,   v7.8b
3520+    st1             {v19.8b}, [TMP2]
3521+    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
3522+    ldp             TMP4,     TMP5,     [OUTPUT_BUF], 16
3523+    add             TMP1,     TMP1,     OUTPUT_COL
3524+    add             TMP2,     TMP2,     OUTPUT_COL
3525+    add             TMP4,     TMP4,     OUTPUT_COL
3526+    add             TMP5,     TMP5,     OUTPUT_COL
3527+    st1             {v10.8b}, [TMP1]
3528+    /* make copy */
3529+    ins             v16.2d[0], v11.2d[1]
3530+    mov             v18.16b,  v11.16b
3531+    trn1            v11.8b,   v11.8b,   v16.8b
3532+    trn2            v16.8b,   v18.8b,   v16.8b
3533+    st1             {v7.8b},  [TMP2]
3534+    st1             {v11.8b}, [TMP4]
3535+    st1             {v16.8b}, [TMP5]
3536+    sub             sp, sp, #176
3537+    ldp             x22, x23, [sp], 16
3538+    ld1             {v0.8b - v3.8b}, [sp], 32
3539+    ld1             {v4.8b - v7.8b}, [sp], 32
3540+    ld1             {v8.8b - v11.8b}, [sp], 32
3541+    ld1             {v12.8b - v15.8b}, [sp], 32
3542+    ld1             {v16.8b - v19.8b}, [sp], 32
3543+    blr             x30
3544+
3545+    .unreq          DCT_TABLE
3546+    .unreq          COEF_BLOCK
3547+    .unreq          OUTPUT_BUF
3548+    .unreq          OUTPUT_COL
3549+    .unreq          TMP1
3550+    .unreq          TMP2
3551+    .unreq          TMP3
3552+    .unreq          TMP4
3553+
3554+
3555+/*****************************************************************************/
3556+
3557+/*
3558+ * jsimd_idct_4x4_neon
3559+ *
3560+ * This function contains inverse-DCT code for getting reduced-size
3561+ * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
3562+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
3563+ * function from jpeg-6b (jidctred.c).
3564+ *
3565+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
3566+ *       requires far fewer arithmetic operations and hence should be faster.
3567+ *       The primary purpose of this particular NEON-optimized function is
3568+ *       bit-exact compatibility with jpeg-6b.
3569+ *
3570+ * TODO: slightly better instruction scheduling could be achieved by expanding
3571+ *       the idct_helper/transpose_4x4 macros and reordering instructions,
3572+ *       but readability would suffer somewhat.
3573+ */
3574+
3575+#define CONST_BITS  13  /* fractional bits: each FIX(x) below is x * 2^13, rounded to an integer */
3576+
3577+#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
3578+#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
3579+#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
3580+#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
3581+#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
3582+#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
3583+#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
3584+#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
3585+#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
3586+#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
3587+#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
3588+#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
3589+#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
3590+#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
3591+
3592+.balign 16
3593+jsimd_idct_4x4_neon_consts:        /* 12 halfwords; jsimd_idct_4x4_neon loads them into v0.4h, v1.4h and v2.4h */
3594+    .short     FIX_1_847759065     /* v0.4h[0] */
3595+    .short     -FIX_0_765366865    /* v0.4h[1] */
3596+    .short     -FIX_0_211164243    /* v0.4h[2] */
3597+    .short     FIX_1_451774981     /* v0.4h[3] */
3598+    .short     -FIX_2_172734803    /* v1.4h[0] */
3599+    .short     FIX_1_061594337     /* v1.4h[1] */
3600+    .short     -FIX_0_509795579    /* v1.4h[2] */
3601+    .short     -FIX_0_601344887    /* v1.4h[3] */
3602+    .short     FIX_0_899976223     /* v2.4h[0] */
3603+    .short     FIX_2_562915447     /* v2.4h[1] */
3604+    .short     1 << (CONST_BITS+1) /* v2.4h[2] */
3605+    .short     0                   /* v2.4h[3] */
3606+/* idct_helper (4x4): from seven 4-lane inputs (x4..x16) compute four descaled 4-lane outputs (y26..y29). Expects v0-v2 = jsimd_idct_4x4_neon_consts; clobbers v20, v24, v26, v28, v30. */
3607+.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
3608+    smull           v28.4s, \x4,    v2.4h[2]    /* v28 = x4  * (1 << (CONST_BITS+1)) */
3609+    smlal           v28.4s, \x8,    v0.4h[0]    /*     + x8  *  FIX_1_847759065 */
3610+    smlal           v28.4s, \x14,   v0.4h[1]    /*     + x14 * -FIX_0_765366865 */
3611+    /* v26 = x16 * -FIX_0_509795579 + x12 * -FIX_0_601344887 + x10 * FIX_0_899976223 + x6 * FIX_2_562915447 */
3612+    smull           v26.4s, \x16,   v1.4h[2]
3613+    smlal           v26.4s, \x12,   v1.4h[3]
3614+    smlal           v26.4s, \x10,   v2.4h[0]
3615+    smlal           v26.4s, \x6,    v2.4h[1]
3616+    /* v30 = same x4 term as v28, but with the x8 and x14 products subtracted instead of added */
3617+    smull           v30.4s, \x4,    v2.4h[2]
3618+    smlsl           v30.4s, \x8,    v0.4h[0]
3619+    smlsl           v30.4s, \x14,   v0.4h[1]
3620+    /* v24 = x16 * -FIX_0_211164243 + x12 * FIX_1_451774981 + x10 * -FIX_2_172734803 + x6 * FIX_1_061594337 */
3621+    smull           v24.4s, \x16,   v0.4h[2]
3622+    smlal           v24.4s, \x12,   v0.4h[3]
3623+    smlal           v24.4s, \x10,   v1.4h[0]
3624+    smlal           v24.4s, \x6,    v1.4h[1]
3625+    /* y26 / y29 = descaled (v28 + v26) / (v28 - v26); all inputs have been read, so outputs may alias them */
3626+    add             v20.4s, v28.4s, v26.4s
3627+    sub             v28.4s, v28.4s, v26.4s
3628+    /* rshrn into .4h only encodes shifts of 1..16, so larger shifts round in 32 bits (srshr) and then narrow (xtn) */
3629+.if \shift > 16
3630+    srshr           v20.4s, v20.4s, #\shift
3631+    srshr           v28.4s, v28.4s, #\shift
3632+    xtn             \y26,   v20.4s
3633+    xtn             \y29,   v28.4s
3634+.else
3635+    rshrn           \y26,   v20.4s, #\shift
3636+    rshrn           \y29,   v28.4s, #\shift
3637+.endif
3638+    /* y27 / y28 = descaled (v30 + v24) / (v30 - v24) */
3639+    add             v20.4s, v30.4s, v24.4s
3640+    sub             v30.4s, v30.4s, v24.4s
3641+    /* Same large-shift handling as for the first output pair */
3642+.if \shift > 16
3643+    srshr           v20.4s, v20.4s, #\shift
3644+    srshr           v30.4s, v30.4s, #\shift
3645+    xtn             \y27,   v20.4s
3646+    xtn             \y28,   v30.4s
3647+.else
3648+    rshrn           \y27,   v20.4s, #\shift
3649+    rshrn           \y28,   v30.4s, #\shift
3650+.endif
3651+
3652+.endm
3653+
3654+asm_function jsimd_idct_4x4_neon
3655+    /* Arguments: x0 = dequantization table, x1 = 8x8 block of 16-bit coefficients, x2 = array of 4 output row pointers, x3 = byte offset added to each row pointer */
3656+    DCT_TABLE       .req x0
3657+    COEF_BLOCK      .req x1
3658+    OUTPUT_BUF      .req x2
3659+    OUTPUT_COL      .req x3
3660+    TMP1            .req x0
3661+    TMP2            .req x1
3662+    TMP3            .req x2
3663+    TMP4            .req x15
3664+    /* Save x15 and all used NEON registers in a 272-byte frame; scratch x9 walks the frame so that sp stays below the saved data until the epilogue */
3665+    sub             sp, sp, 272
3666+    mov             x9, sp
3667+    str             x15, [x9], 16
3668+    /* Load constants (v3.4h is just used for padding) */
3669+    adr             TMP4, jsimd_idct_4x4_neon_consts
3670+    st1             {v0.8b - v3.8b}, [x9], 32
3671+    st1             {v4.8b - v7.8b}, [x9], 32
3672+    st1             {v8.8b - v11.8b}, [x9], 32
3673+    st1             {v12.8b - v15.8b}, [x9], 32
3674+    st1             {v16.8b - v19.8b}, [x9], 32
3675+    st1             {v20.8b - v23.8b}, [x9], 32
3676+    st1             {v24.8b - v27.8b}, [x9], 32
3677+    st1             {v28.8b - v31.8b}, [x9], 32
3678+    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
3679+
3680+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
3681+     *       0 1 2 3 | 4 5 6 7
3682+     *      ---------+--------
3683+     *   0 | v4.4h   | v5.4h
3684+     *   1 | v6.4h   | v7.4h
3685+     *   2 | v8.4h   | v9.4h
3686+     *   3 | v10.4h  | v11.4h
3687+     *   4 | -       | -
3688+     *   5 | v12.4h  | v13.4h
3689+     *   6 | v14.4h  | v15.4h
3690+     *   7 | v16.4h  | v17.4h
3691+     */
3692+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
3693+    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
3694+    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 4 (unused) */
3695+    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
3696+    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
3697+    /* dequantize */
3698+    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3699+    mul             v4.4h, v4.4h, v18.4h
3700+    mul             v5.4h, v5.4h, v19.4h
3701+    ins             v4.2d[1], v5.2d[0]    /* 128 bit q4 */
3702+    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
3703+    mul             v6.4h, v6.4h, v20.4h
3704+    mul             v7.4h, v7.4h, v21.4h
3705+    ins             v6.2d[1], v7.2d[0]    /* 128 bit q6 */
3706+    mul             v8.4h, v8.4h, v22.4h
3707+    mul             v9.4h, v9.4h, v23.4h
3708+    ins             v8.2d[1], v9.2d[0]    /* 128 bit q8 */
3709+    add             DCT_TABLE, DCT_TABLE, #16      /* skip the table row matching coefficient row 4 */
3710+    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
3711+    mul             v10.4h, v10.4h, v24.4h
3712+    mul             v11.4h, v11.4h, v25.4h
3713+    ins             v10.2d[1], v11.2d[0]  /* 128 bit q10 */
3714+    mul             v12.4h, v12.4h, v26.4h
3715+    mul             v13.4h, v13.4h, v27.4h
3716+    ins             v12.2d[1], v13.2d[0]  /* 128 bit q12 */
3717+    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
3718+    mul             v14.4h, v14.4h, v28.4h
3719+    mul             v15.4h, v15.4h, v29.4h
3720+    ins             v14.2d[1], v15.2d[0]  /* 128 bit q14 */
3721+    mul             v16.4h, v16.4h, v30.4h
3722+    mul             v17.4h, v17.4h, v31.4h
3723+    ins             v16.2d[1], v17.2d[0]  /* 128 bit q16 */
3724+
3725+    /* Pass 1 */
3726+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
3727+    transpose_4x4   v4, v6, v8, v10, v3
3728+    ins             v10.2d[1], v11.2d[0]
3729+    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
3730+    transpose_4x4   v5, v7, v9, v11, v3
3731+    ins             v10.2d[1], v11.2d[0]
3732+    /* Pass 2 */
3733+    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
3734+    transpose_4x4   v26, v27, v28, v29, v3
3735+
3736+    /* Range limit */
3737+    movi            v30.8h, #0x80
3738+    ins             v26.2d[1], v27.2d[0]
3739+    ins             v28.2d[1], v29.2d[0]
3740+    add             v26.8h, v26.8h, v30.8h    /* re-center by +128 ... */
3741+    add             v28.8h, v28.8h, v30.8h
3742+    sqxtun          v26.8b, v26.8h            /* ... and saturate to unsigned 8-bit */
3743+    sqxtun          v27.8b, v28.8h
3744+
3745+    /* Store results to the output buffer */
3746+    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
3747+    ldp             TMP3, TMP4, [OUTPUT_BUF]
3748+    add             TMP1, TMP1, OUTPUT_COL
3749+    add             TMP2, TMP2, OUTPUT_COL
3750+    add             TMP3, TMP3, OUTPUT_COL
3751+    add             TMP4, TMP4, OUTPUT_COL
3752+
3753+#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
3754+    /* We can use much less instructions on little endian systems if the
3755+     * OS kernel is not configured to trap unaligned memory accesses
3756+     */
3757+    st1             {v26.s}[0], [TMP1], 4
3758+    st1             {v27.s}[0], [TMP3], 4
3759+    st1             {v26.s}[1], [TMP2], 4
3760+    st1             {v27.s}[1], [TMP4], 4
3761+#else
3762+    st1             {v26.b}[0], [TMP1], 1
3763+    st1             {v27.b}[0], [TMP3], 1
3764+    st1             {v26.b}[1], [TMP1], 1
3765+    st1             {v27.b}[1], [TMP3], 1
3766+    st1             {v26.b}[2], [TMP1], 1
3767+    st1             {v27.b}[2], [TMP3], 1
3768+    st1             {v26.b}[3], [TMP1], 1
3769+    st1             {v27.b}[3], [TMP3], 1
3770+
3771+    st1             {v26.b}[4], [TMP2], 1
3772+    st1             {v27.b}[4], [TMP4], 1
3773+    st1             {v26.b}[5], [TMP2], 1
3774+    st1             {v27.b}[5], [TMP4], 1
3775+    st1             {v26.b}[6], [TMP2], 1
3776+    st1             {v27.b}[6], [TMP4], 1
3777+    st1             {v26.b}[7], [TMP2], 1
3778+    st1             {v27.b}[7], [TMP4], 1
3779+#endif
3780+
3781+    /* vpop            {v8.4h - v15.4h}    ;not available */
3782+    /* Restore in save order; sp still points at the frame, and the post-indexed loads release it */
3783+    ldr             x15, [sp], 16
3784+    ld1             {v0.8b - v3.8b}, [sp], 32
3785+    ld1             {v4.8b - v7.8b}, [sp], 32
3786+    ld1             {v8.8b - v11.8b}, [sp], 32
3787+    ld1             {v12.8b - v15.8b}, [sp], 32
3788+    ld1             {v16.8b - v19.8b}, [sp], 32
3789+    ld1             {v20.8b - v23.8b}, [sp], 32
3790+    ld1             {v24.8b - v27.8b}, [sp], 32
3791+    ld1             {v28.8b - v31.8b}, [sp], 32
3792+    ret                                       /* plain return through x30; blr would overwrite the link register */
3793+
3794+    .unreq          DCT_TABLE
3795+    .unreq          COEF_BLOCK
3796+    .unreq          OUTPUT_BUF
3797+    .unreq          OUTPUT_COL
3798+    .unreq          TMP1
3799+    .unreq          TMP2
3800+    .unreq          TMP3
3801+    .unreq          TMP4
3802+
3803+.purgem idct_helper
3804+
3805+
3806+/*****************************************************************************/
3807+
3808+/*
3809+ * jsimd_idct_2x2_neon
3810+ *
3811+ * This function contains inverse-DCT code for getting reduced-size
3812+ * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
3813+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
3814+ * function from jpeg-6b (jidctred.c).
3815+ *
3816+ * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
3817+ *       requires far fewer arithmetic operations and hence should be faster.
3818+ *       The primary purpose of this particular NEON-optimized function is
3819+ *       bit-exact compatibility with jpeg-6b.
3820+ */
3821+
3822+.balign 8
3823+jsimd_idct_2x2_neon_consts:        /* 4 halfwords; jsimd_idct_2x2_neon loads them into v14.4h */
3824+    .short     -FIX_0_720959822    /* v14.4h[0] */
3825+    .short     FIX_0_850430095     /* v14.4h[1] */
3826+    .short     -FIX_1_272758580    /* v14.4h[2] */
3827+    .short     FIX_3_624509785     /* v14.4h[3] */
3828+/* idct_helper (2x2): from five 4-lane inputs (x4..x16) compute two descaled 4-lane outputs (y26, y27). Expects v14.4h = jsimd_idct_2x2_neon_consts; clobbers v15, v20, v26. */
3829+.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
3830+    sshll      v15.4s, \x4,    #15          /* v15 = x4 * 2^15 */
3831+    smull      v26.4s, \x6,    v14.4h[3]    /* v26 = x6  *  FIX_3_624509785 */
3832+    smlal      v26.4s, \x10,   v14.4h[2]    /*     + x10 * -FIX_1_272758580 */
3833+    smlal      v26.4s, \x12,   v14.4h[1]    /*     + x12 *  FIX_0_850430095 */
3834+    smlal      v26.4s, \x16,   v14.4h[0]    /*     + x16 * -FIX_0_720959822 */
3835+    /* y26 / y27 = descaled (v15 + v26) / (v15 - v26) */
3836+    add        v20.4s, v15.4s, v26.4s
3837+    sub        v15.4s, v15.4s, v26.4s
3838+    /* rshrn into .4h only encodes shifts of 1..16, so larger shifts round in 32 bits (srshr) and then narrow (xtn) */
3839+.if \shift > 16
3840+    srshr      v20.4s, v20.4s, #\shift
3841+    srshr      v15.4s, v15.4s, #\shift
3842+    xtn        \y26,   v20.4s
3843+    xtn        \y27,   v15.4s
3844+.else
3845+    rshrn      \y26,   v20.4s, #\shift
3846+    rshrn      \y27,   v15.4s, #\shift
3847+.endif
3848+
3849+.endm
3850+
3851+asm_function jsimd_idct_2x2_neon
3852+    /* Arguments: x0 = dequantization table, x1 = 8x8 block of 16-bit coefficients, x2 = array of 2 output row pointers, x3 = byte offset added to each row pointer */
3853+    DCT_TABLE       .req x0
3854+    COEF_BLOCK      .req x1
3855+    OUTPUT_BUF      .req x2
3856+    OUTPUT_COL      .req x3
3857+    TMP1            .req x0
3858+    TMP2            .req x15
3859+    /* Save x15 and the NEON registers listed below in a 208-byte frame; scratch x9 walks the frame so that sp stays below the saved data until the epilogue */
3860+    /* vpush           {v8.4h - v15.4h}            ; not available */
3861+    sub             sp, sp, 208
3862+    mov             x9, sp
3863+    str             x15, [x9], 16
3864+    /* Load constants */
3865+    adr             TMP2, jsimd_idct_2x2_neon_consts
3866+    st1             {v4.8b - v7.8b}, [x9], 32
3867+    st1             {v8.8b - v11.8b}, [x9], 32
3868+    st1             {v12.8b - v15.8b}, [x9], 32
3869+    st1             {v16.8b - v19.8b}, [x9], 32
3870+    st1             {v21.8b - v22.8b}, [x9], 16
3871+    st1             {v24.8b - v27.8b}, [x9], 32
3872+    st1             {v30.8b - v31.8b}, [x9], 16
3873+    ld1             {v14.4h}, [TMP2]
3874+
3875+    /* Load all COEF_BLOCK into NEON registers with the following allocation:
3876+     *       0 1 2 3 | 4 5 6 7
3877+     *      ---------+--------
3878+     *   0 | v4.4h   | v5.4h
3879+     *   1 | v6.4h   | v7.4h
3880+     *   2 | -       | -
3881+     *   3 | v10.4h  | v11.4h
3882+     *   4 | -       | -
3883+     *   5 | v12.4h  | v13.4h
3884+     *   6 | -       | -
3885+     *   7 | v16.4h  | v17.4h
3886+     */
3887+    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
3888+    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 2 (unused) */
3889+    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
3890+    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 4 (unused) */
3891+    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
3892+    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 6 (unused) */
3893+    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
3894+    /* Dequantize */
3895+    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3896+    mul             v4.4h, v4.4h, v18.4h
3897+    mul             v5.4h, v5.4h, v19.4h
3898+    ins             v4.2d[1], v5.2d[0]
3899+    mul             v6.4h, v6.4h, v20.4h
3900+    mul             v7.4h, v7.4h, v21.4h
3901+    ins             v6.2d[1], v7.2d[0]
3902+    add             DCT_TABLE, DCT_TABLE, #16      /* skip the table rows of the unused coefficient rows, as above */
3903+    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
3904+    mul             v10.4h, v10.4h, v24.4h
3905+    mul             v11.4h, v11.4h, v25.4h
3906+    ins             v10.2d[1], v11.2d[0]
3907+    add             DCT_TABLE, DCT_TABLE, #16
3908+    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
3909+    mul             v12.4h, v12.4h, v26.4h
3910+    mul             v13.4h, v13.4h, v27.4h
3911+    ins             v12.2d[1], v13.2d[0]
3912+    add             DCT_TABLE, DCT_TABLE, #16
3913+    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
3914+    mul             v16.4h, v16.4h, v30.4h
3915+    mul             v17.4h, v17.4h, v31.4h
3916+    ins             v16.2d[1], v17.2d[0]
3917+
3918+    /* Pass 1 */
3919+#if 0
3920+    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
3921+    transpose_4x4   v4.4h, v6.4h, v8.4h,  v10.4h
3922+    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
3923+    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
3924+#else
3925+    smull           v26.4s, v6.4h,  v14.4h[3]
3926+    smlal           v26.4s, v10.4h, v14.4h[2]
3927+    smlal           v26.4s, v12.4h, v14.4h[1]
3928+    smlal           v26.4s, v16.4h, v14.4h[0]
3929+    smull           v24.4s, v7.4h,  v14.4h[3]
3930+    smlal           v24.4s, v11.4h, v14.4h[2]
3931+    smlal           v24.4s, v13.4h, v14.4h[1]
3932+    smlal           v24.4s, v17.4h, v14.4h[0]
3933+    sshll           v15.4s, v4.4h,  #15
3934+    sshll           v30.4s, v5.4h,  #15
3935+    add             v20.4s, v15.4s, v26.4s
3936+    sub             v15.4s, v15.4s, v26.4s
3937+    rshrn           v4.4h,  v20.4s, #13
3938+    rshrn           v6.4h,  v15.4s, #13
3939+    add             v20.4s, v30.4s, v24.4s
3940+    sub             v15.4s, v30.4s, v24.4s
3941+    rshrn           v5.4h,  v20.4s, #13
3942+    rshrn           v7.4h,  v15.4s, #13
3943+    ins             v4.2d[1], v5.2d[0]
3944+    ins             v6.2d[1], v7.2d[0]
3945+    transpose       v4, v6, v3, .16b, .8h
3946+    transpose       v6, v10, v3, .16b, .4s
3947+    ins             v11.2d[0], v10.2d[1]
3948+    ins             v7.2d[0], v6.2d[1]
3949+#endif
3950+
3951+    /* Pass 2 */
3952+    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
3953+
3954+    /* Range limit */
3955+    movi            v30.8h, #0x80
3956+    ins             v26.2d[1], v27.2d[0]
3957+    add             v26.8h, v26.8h, v30.8h    /* re-center by +128 */
3958+    sqxtun          v30.8b, v26.8h            /* saturate all 8 halfwords to unsigned 8-bit */
3959+    ins             v26.2d[0], v30.2d[0]      /* bytes 0..7 of v26 now hold the narrowed samples */
3960+    sqxtun          v27.8b, v26.8h            /* v27 bytes 4..7 = narrowed upper halfwords of v26 */
3961+
3962+    /* Store results to the output buffer */
3963+    ldp             TMP1, TMP2, [OUTPUT_BUF]
3964+    add             TMP1, TMP1, OUTPUT_COL
3965+    add             TMP2, TMP2, OUTPUT_COL
3966+
3967+    st1             {v26.b}[0], [TMP1], 1
3968+    st1             {v27.b}[4], [TMP1], 1
3969+    st1             {v26.b}[1], [TMP2], 1
3970+    st1             {v27.b}[5], [TMP2], 1
3971+
3972+    /* Restore in save order; sp still points at the frame, and the post-indexed loads release it */
3973+    ldr             x15, [sp], 16
3974+    ld1             {v4.8b - v7.8b}, [sp], 32
3975+    ld1             {v8.8b - v11.8b}, [sp], 32
3976+    ld1             {v12.8b - v15.8b}, [sp], 32
3977+    ld1             {v16.8b - v19.8b}, [sp], 32
3978+    ld1             {v21.8b - v22.8b}, [sp], 16
3979+    ld1             {v24.8b - v27.8b}, [sp], 32
3980+    ld1             {v30.8b - v31.8b}, [sp], 16
3981+    ret                                       /* plain return through x30; blr would overwrite the link register */
3982+
3983+    .unreq          DCT_TABLE
3984+    .unreq          COEF_BLOCK
3985+    .unreq          OUTPUT_BUF
3986+    .unreq          OUTPUT_COL
3987+    .unreq          TMP1
3988+    .unreq          TMP2
3989+
3990+.purgem idct_helper
3991+
3992+
3993+/*****************************************************************************/
3994+
3995+/*
3996+ * jsimd_ycc_extrgb_convert_neon
3997+ * jsimd_ycc_extbgr_convert_neon
3998+ * jsimd_ycc_extrgbx_convert_neon
3999+ * jsimd_ycc_extbgrx_convert_neon
4000+ * jsimd_ycc_extxbgr_convert_neon
4001+ * jsimd_ycc_extxrgb_convert_neon
4002+ *
4003+ * Colorspace conversion YCbCr -> RGB
4004+ */
4005+
4006+/* do_load size: read `size` bytes each from [U], [V] and [Y] (post-incrementing the pointers) into v4, v5 and v0. Sizes 4, 2 and 1 fill lanes 0-3, 4-5 and 6, so they can be combined for a tail of up to 7 pixels. */
4007+.macro do_load size
4008+    .if \size == 8
4009+        ld1  {v4.8b}, [U], 8
4010+        ld1  {v5.8b}, [V], 8
4011+        ld1  {v0.8b}, [Y], 8
4012+        prfm PLDL1KEEP, [U, #64]        /* prefetch 64 bytes ahead of each input pointer */
4013+        prfm PLDL1KEEP, [V, #64]
4014+        prfm PLDL1KEEP, [Y, #64]
4015+    .elseif \size == 4                  /* byte lanes 0-3 */
4016+        ld1  {v4.b}[0], [U], 1
4017+        ld1  {v4.b}[1], [U], 1
4018+        ld1  {v4.b}[2], [U], 1
4019+        ld1  {v4.b}[3], [U], 1
4020+        ld1  {v5.b}[0], [V], 1
4021+        ld1  {v5.b}[1], [V], 1
4022+        ld1  {v5.b}[2], [V], 1
4023+        ld1  {v5.b}[3], [V], 1
4024+        ld1  {v0.b}[0], [Y], 1
4025+        ld1  {v0.b}[1], [Y], 1
4026+        ld1  {v0.b}[2], [Y], 1
4027+        ld1  {v0.b}[3], [Y], 1
4028+    .elseif \size == 2                  /* byte lanes 4-5 */
4029+        ld1  {v4.b}[4], [U], 1
4030+        ld1  {v4.b}[5], [U], 1
4031+        ld1  {v5.b}[4], [V], 1
4032+        ld1  {v5.b}[5], [V], 1
4033+        ld1  {v0.b}[4], [Y], 1
4034+        ld1  {v0.b}[5], [Y], 1
4035+    .elseif \size == 1                  /* byte lane 6 */
4036+        ld1  {v4.b}[6], [U], 1
4037+        ld1  {v5.b}[6], [V], 1
4038+        ld1  {v0.b}[6], [Y], 1
4039+    .else
4040+        .error unsupported macroblock size
4041+    .endif
4042+.endm
4043+/* do_store bpp, size: write `size` pixels to [RGB] (post-incremented). bpp 24 interleaves v10-v12, bpp 32 interleaves v10-v13, bpp 16 stores halfwords of v25. Sizes 4, 2 and 1 use lanes 0-3, 4-5 and 6, matching do_load. */
4044+.macro do_store bpp, size
4045+    .if \bpp == 24
4046+        .if \size == 8
4047+            st3  {v10.8b, v11.8b, v12.8b}, [RGB], 24
4048+        .elseif \size == 4
4049+            st3  {v10.b, v11.b, v12.b}[0], [RGB], 3
4050+            st3  {v10.b, v11.b, v12.b}[1], [RGB], 3
4051+            st3  {v10.b, v11.b, v12.b}[2], [RGB], 3
4052+            st3  {v10.b, v11.b, v12.b}[3], [RGB], 3
4053+        .elseif \size == 2
4054+            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
4055+            st3  {v10.b, v11.b, v12.b}[5], [RGB], 3
4056+        .elseif \size == 1
4057+            st3  {v10.b, v11.b, v12.b}[6], [RGB], 3
4058+        .else
4059+            .error unsupported macroblock size
4060+        .endif
4061+    .elseif \bpp == 32
4062+        .if \size == 8
4063+            st4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
4064+        .elseif \size == 4
4065+            st4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
4066+            st4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
4067+            st4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
4068+            st4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
4069+        .elseif \size == 2
4070+            st4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
4071+            st4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
4072+        .elseif \size == 1
4073+            st4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
4074+        .else
4075+            .error unsupported macroblock size
4076+        .endif
4077+    .elseif \bpp==16
4078+        .if \size == 8
4079+            st1  {v25.8h}, [RGB],16         /* eight packed 16-bit pixels */
4080+        .elseif \size == 4
4081+            st1  {v25.4h}, [RGB],8          /* halfword lanes 0-3 */
4082+        .elseif \size == 2
4083+            st1  {v25.h}[4], [RGB],2
4084+            st1  {v25.h}[5], [RGB],2
4085+        .elseif \size == 1
4086+            st1  {v25.h}[6], [RGB],2
4087+        .else
4088+            .error unsupported macroblock size
4089+        .endif
4090+     .else
4091+        .error unsupported bpp
4092+    .endif
4093+.endm
4094+
4095+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
4096+
4097+/*
4098+ * 2-stage pipelined YCbCr->RGB conversion
4099+ */
4100+
4101+.macro do_yuv_to_rgb_stage1
4102+    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 (v2 holds -128 in every lane) */
4103+    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
4104+    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4105+    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4106+    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4107+    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4108+    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4109+    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4110+    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4111+    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4112+.endm
4113+
4114+.macro do_yuv_to_rgb_stage2
4115+    rshrn        v20.4h, v20.4s, #15     /* green term (v20/v22): rounding descale by 15 bits */
4116+    rshrn2       v20.8h, v22.4s, #15
4117+    rshrn        v24.4h, v24.4s, #14     /* red term (v24/v26): rounding descale by 14 bits */
4118+    rshrn2       v24.8h, v26.4s, #14
4119+    rshrn        v28.4h, v28.4s, #14     /* blue term (v28/v30): rounding descale by 14 bits */
4120+    rshrn2       v28.8h, v30.4s, #14
4121+    uaddw        v20.8h, v20.8h, v0.8b   /* add the Y samples to each term */
4122+    uaddw        v24.8h, v24.8h, v0.8b
4123+    uaddw        v28.8h, v28.8h, v0.8b
4124+.if \bpp != 16
4125+    sqxtun       v1\g_offs\defsize, v20.8h   /* saturate to unsigned 8-bit into the register chosen by the g/r/b offsets */
4126+    sqxtun       v1\r_offs\defsize, v24.8h
4127+    sqxtun       v1\b_offs\defsize, v28.8h
4128+.else
4129+    sqshlu       v21.8h, v20.8h, #8      /* saturate each component and left-align it in 16 bits */
4130+    sqshlu       v25.8h, v24.8h, #8
4131+    sqshlu       v29.8h, v28.8h, #8
4132+    sri          v25.8h, v21.8h, #5      /* v25 keeps red in bits 15:11, green inserted below it */
4133+    sri          v25.8h, v29.8h, #11     /* blue inserted into bits 4:0, leaving green in bits 10:5 */
4134+.endif
4135+
4136+.endm
4137+
4138+.macro do_yuv_to_rgb_stage2_store_load_stage1
4139+    rshrn        v20.4h, v20.4s, #15     /* stage 2 for the current 8 pixels, interleaved with loads for the next 8 */
4140+    rshrn        v24.4h, v24.4s, #14
4141+    rshrn        v28.4h, v28.4s, #14
4142+    ld1          {v4.8b}, [U], 8
4143+    rshrn2       v20.8h, v22.4s, #15
4144+    rshrn2       v24.8h, v26.4s, #14
4145+    rshrn2       v28.8h, v30.4s, #14
4146+    ld1          {v5.8b}, [V], 8
4147+    uaddw        v20.8h, v20.8h, v0.8b
4148+    uaddw        v24.8h, v24.8h, v0.8b
4149+    uaddw        v28.8h, v28.8h, v0.8b
4150+.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
4151+    sqxtun       v1\g_offs\defsize, v20.8h
4152+    ld1          {v0.8b}, [Y], 8
4153+    sqxtun       v1\r_offs\defsize, v24.8h
4154+    prfm         PLDL1KEEP, [U, #64]
4155+    prfm         PLDL1KEEP, [V, #64]
4156+    prfm         PLDL1KEEP, [Y, #64]
4157+    sqxtun       v1\b_offs\defsize, v28.8h
4158+    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
4159+    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
4160+    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4161+    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4162+    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4163+    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4164+    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4165+    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4166+.else /**************************** rgb565 ***********************************/
4167+    sqshlu       v21.8h, v20.8h, #8
4168+    sqshlu       v25.8h, v24.8h, #8
4169+    sqshlu       v29.8h, v28.8h, #8
4170+    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
4171+    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
4172+    ld1          {v0.8b}, [Y], 8
4173+    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4174+    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4175+    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4176+    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4177+    sri          v25.8h, v21.8h, #5
4178+    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4179+    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4180+    prfm         PLDL1KEEP, [U, #64]
4181+    prfm         PLDL1KEEP, [V, #64]
4182+    prfm         PLDL1KEEP, [Y, #64]
4183+    sri          v25.8h, v29.8h, #11
4184+.endif
4185+    do_store     \bpp, 8                 /* must precede the v28 update below: v28 shares a register with an output when an offset is 8 -- NOTE(review): confirm against the macro's callers */
4186+    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4187+    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4188+.endm
4189+
4190+.macro do_yuv_to_rgb
4191+    do_yuv_to_rgb_stage1                 /* non-pipelined conversion: both stages back to back */
4192+    do_yuv_to_rgb_stage2
4193+.endm
4194+
4195+/* Apple gas crashes on adrl, work around that by using adr.
4196+ * But this requires a copy of these constants for each function.
4197+ */
4198+
4199+.balign 16
4200+jsimd_ycc_\colorid\()_neon_consts:
4201+    .short          0,      0,     0,      0
4202+    .short          22971, -11277, -23401, 29033
4203+    .short          -128,  -128,   -128,   -128
4204+    .short          -128,  -128,   -128,   -128
4205+
4206+asm_function jsimd_ycc_\colorid\()_convert_neon
4207+    OUTPUT_WIDTH    .req x0
4208+    INPUT_BUF       .req x1
4209+    INPUT_ROW       .req x2
4210+    OUTPUT_BUF      .req x3
4211+    NUM_ROWS        .req x4
4212+
4213+    INPUT_BUF0      .req x5
4214+    INPUT_BUF1      .req x6
4215+    INPUT_BUF2      .req INPUT_BUF
4216+
4217+    RGB             .req x7
4218+    Y               .req x8
4219+    U               .req x9
4220+    V               .req x10
4221+    N               .req x15
4222+
4223+    sub             sp, sp, 336
4224+    str             x15, [sp], 16
4225+    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
4226+    adr             x15, jsimd_ycc_\colorid\()_neon_consts
4227+    /* Save NEON registers */
4228+    st1             {v0.8b - v3.8b}, [sp], 32
4229+    st1             {v4.8b - v7.8b}, [sp], 32
4230+    st1             {v8.8b - v11.8b}, [sp], 32
4231+    st1             {v12.8b - v15.8b}, [sp], 32
4232+    st1             {v16.8b - v19.8b}, [sp], 32
4233+    st1             {v20.8b - v23.8b}, [sp], 32
4234+    st1             {v24.8b - v27.8b}, [sp], 32
4235+    st1             {v28.8b - v31.8b}, [sp], 32
4236+    ld1             {v0.4h, v1.4h}, [x15], 16
4237+    ld1             {v2.8h}, [x15]
4238+
4239+    /* Save ARM registers and handle input arguments */
4240+    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
4241+    stp             x4, x5, [sp], 16
4242+    stp             x6, x7, [sp], 16
4243+    stp             x8, x9, [sp], 16
4244+    stp             x10, x30, [sp], 16
4245+    ldr             INPUT_BUF0, [INPUT_BUF]
4246+    ldr             INPUT_BUF1, [INPUT_BUF, 8]
4247+    ldr             INPUT_BUF2, [INPUT_BUF, 16]
4248+    .unreq          INPUT_BUF
4249+
4250+    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
4251+    movi            v10.16b, #255
4252+    movi            v13.16b, #255
4253+
4254+    /* Outer loop over scanlines */
4255+    cmp             NUM_ROWS, #1
4256+    blt             9f
4257+0:
4258+    lsl             x16, INPUT_ROW, #3
4259+    ldr             Y, [INPUT_BUF0, x16]
4260+    ldr             U, [INPUT_BUF1, x16]
4261+    mov             N, OUTPUT_WIDTH
4262+    ldr             V, [INPUT_BUF2, x16]
4263+    add             INPUT_ROW, INPUT_ROW, #1
4264+    ldr             RGB, [OUTPUT_BUF], #8
4265+
4266+    /* Inner loop over pixels */
4267+    subs            N, N, #8
4268+    blt             3f
4269+    do_load         8
4270+    do_yuv_to_rgb_stage1
4271+    subs            N, N, #8
4272+    blt             2f
4273+1:
4274+    do_yuv_to_rgb_stage2_store_load_stage1
4275+    subs            N, N, #8
4276+    bge             1b
4277+2:
4278+    do_yuv_to_rgb_stage2
4279+    do_store        \bpp, 8
4280+    tst             N, #7
4281+    beq             8f
4282+3:
4283+    tst             N, #4
4284+    beq             3f
4285+    do_load         4
4286+3:
4287+    tst             N, #2
4288+    beq             4f
4289+    do_load         2
4290+4:
4291+    tst             N, #1
4292+    beq             5f
4293+    do_load         1
4294+5:
4295+    do_yuv_to_rgb
4296+    tst             N, #4
4297+    beq             6f
4298+    do_store        \bpp, 4
4299+6:
4300+    tst             N, #2
4301+    beq             7f
4302+    do_store        \bpp, 2
4303+7:
4304+    tst             N, #1
4305+    beq             8f
4306+    do_store        \bpp, 1
4307+8:
4308+    subs            NUM_ROWS, NUM_ROWS, #1
4309+    bgt             0b
4310+9:
4311+    /* Restore all registers and return */
4312+    sub             sp, sp, #336
4313+    ldr             x15, [sp], 16
4314+    ld1             {v0.8b - v3.8b}, [sp], 32
4315+    ld1             {v4.8b - v7.8b}, [sp], 32
4316+    ld1             {v8.8b - v11.8b}, [sp], 32
4317+    ld1             {v12.8b - v15.8b}, [sp], 32
4318+    ld1             {v16.8b - v19.8b}, [sp], 32
4319+    ld1             {v20.8b - v23.8b}, [sp], 32
4320+    ld1             {v24.8b - v27.8b}, [sp], 32
4321+    ld1             {v28.8b - v31.8b}, [sp], 32
4322+    /* pop             {x4, x5, x6, x7, x8, x9, x10, x30} */
4323+    ldp             x4, x5, [sp], 16
4324+    ldp             x6, x7, [sp], 16
4325+    ldp             x8, x9, [sp], 16
4326+    ldp             x10, x30, [sp], 16
4327+    br              x30
4328+    .unreq          OUTPUT_WIDTH
4329+    .unreq          INPUT_ROW
4330+    .unreq          OUTPUT_BUF
4331+    .unreq          NUM_ROWS
4332+    .unreq          INPUT_BUF0
4333+    .unreq          INPUT_BUF1
4334+    .unreq          INPUT_BUF2
4335+    .unreq          RGB
4336+    .unreq          Y
4337+    .unreq          U
4338+    .unreq          V
4339+    .unreq          N
4340+
4341+.purgem do_yuv_to_rgb
4342+.purgem do_yuv_to_rgb_stage1
4343+.purgem do_yuv_to_rgb_stage2
4344+.purgem do_yuv_to_rgb_stage2_store_load_stage1
4345+.endm
4346+/* Expand generate_jsimd_ycc_rgb_convert_neon once per line below; the header comment names each argument column. */
4347+/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
4348+generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b
4349+generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b
4350+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .8b
4351+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
4352+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
4353+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
4354+generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b
4355+.purgem do_load     /* remove the do_load helper macro now that every expansion above has used it */
4356+.purgem do_store    /* likewise remove the do_store helper macro */
4357