• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1diff --git a/arm/arm_init.c b/arm/arm_init.c
2index 3a89998ab..05aa2c0d9 100644
3--- a/arm/arm_init.c
4+++ b/arm/arm_init.c
5@@ -113,13 +113,23 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
6     * initialization function.)
7     */
8    pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon;
9-
10+#ifdef PNG_MULTY_LINE_ENABLE
11+   // OH ISSUE: png optimize
12+   pp->read_filter[PNG_FILTER_VALUE_UP_X2-1] = png_read_filter_row_up_x2_neon;
13+#endif
14    if (bpp == 3)
15    {
16       pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon;
17       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon;
18       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
19          png_read_filter_row_paeth3_neon;
20+#ifdef PNG_MULTY_LINE_ENABLE
21+      // OH ISSUE: png optimize
22+      pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
23+         png_read_filter_row_avg3_x2_neon;
24+      pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
25+         png_read_filter_row_paeth3_x2_neon;
26+#endif
27    }
28
29    else if (bpp == 4)
30@@ -128,6 +138,13 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
31       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon;
32       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
33           png_read_filter_row_paeth4_neon;
34+#ifdef PNG_MULTY_LINE_ENABLE
35+      // OH ISSUE: png optimize
36+      pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
37+         png_read_filter_row_avg4_x2_neon;
38+      pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
39+         png_read_filter_row_paeth4_x2_neon;
40+#endif
41    }
42 }
43 #endif /* PNG_ARM_NEON_OPT > 0 */
44diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c
45index 4466d48b2..27048a578 100644
46--- a/arm/filter_neon_intrinsics.c
47+++ b/arm/filter_neon_intrinsics.c
48@@ -47,6 +47,7 @@
49
50 #if PNG_ARM_NEON_OPT > 0
51
52+#ifndef PNG_MULTY_LINE_ENABLE
53 void
54 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
55    png_const_bytep prev_row)
56@@ -396,7 +397,1351 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
57       vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
58    }
59 }
60+#else
61+// OH ISSUE: png optimize
62+// according to definition: row_info->rowbytes = row_width * row_info->channels,
63+// the input rowbytes must be 3 or 4 times the channel size, so:
64+// for RGB neon process 12 bytes at once,the tail must be 3,6,9;
65+// for RGBA neon process 16 or 8 bytes at once,the tail must be 4;
66+// filter operators are internal function, row_info and row ensure non empty outside.
67+#define STEP_RGB (12) // 3 channel RGB process 12 bytes at once
68+#define TAIL_RGB3 (9) // tail 3 pixels have 9 bytes
69+#define TAIL_RGB2 (6) // tail 2 pixels have 6 bytes
70+#define TAIL_RGB1 (3) // tail 1 pixel have 3 bytes
71+#define STEP_RGBA (16) // GBA neon process 16 bytes at once
72+#define STEP_RGBA_HALF (8) // GBA neon process 8 bytes at once
73+#define TAIL_RGBA (4) // tail 1 pixel have 4 bytes
74+#define IND3 (3) // index 3
75+#define IND2 (2) // index 2
76+#define OFFSET3 (3) // RGB offset 3 bytes
77+#define OFFSET6 (6) // RGB offset 6 bytes
78+void png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
79+   png_const_bytep prev_row)
80+{
81+   png_bytep rp = row;
82+   png_const_bytep pp = prev_row;
83+   int count = row_info->rowbytes;
84+
85+   png_debug(1, "in png_read_filter_row_up_neon");
86+
87+   uint8x16_t qrp, qpp;
88+   while (count >= STEP_RGBA) {
89+      qrp = vld1q_u8(rp);
90+      qpp = vld1q_u8(pp);
91+      qrp = vaddq_u8(qrp, qpp);
92+      vst1q_u8(rp, qrp);
93+      rp += STEP_RGBA;
94+      pp += STEP_RGBA;
95+      count -= STEP_RGBA;
96+   }
97+
98+   if (count >= STEP_RGBA_HALF) {
99+      uint8x8_t qrp1, qpp1;
100+      qrp1 = vld1_u8(rp);
101+      qpp1 = vld1_u8(pp);
102+      qrp1 = vadd_u8(qrp1, qpp1);
103+      vst1_u8(rp, qrp1);
104+      rp += STEP_RGBA_HALF;
105+      pp += STEP_RGBA_HALF;
106+      count -= STEP_RGBA_HALF;
107+   }
108+
109+   for (int i = 0; i < count; i++) {
110+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
111+      rp++;
112+   }
113+}
114+
115+void png_read_filter_row_up_x2_neon(png_row_infop row_info, png_bytep row,
116+   png_const_bytep prev_row)
117+{
118+   png_bytep rp = row;
119+   png_const_bytep pp = prev_row;
120+   int count = row_info->rowbytes;
121+   png_bytep np = row + row_info->rowbytes + 1;
122+
123+   png_debug(1, "in png_read_filter_row_up_x2_neon");
124+
125+   uint8x16_t qrp, qpp, qnp;
126+   while (count >= STEP_RGBA) {
127+      qrp = vld1q_u8(rp);
128+      qpp = vld1q_u8(pp);
129+      qnp = vld1q_u8(np);
130+      qrp = vaddq_u8(qrp, qpp);
131+      qnp = vaddq_u8(qnp, qrp);
132+      vst1q_u8(rp, qrp);
133+      vst1q_u8(np, qnp);
134+      rp += STEP_RGBA;
135+      pp += STEP_RGBA;
136+      np += STEP_RGBA;
137+      count -= STEP_RGBA;
138+   }
139+
140+   if (count >= STEP_RGBA_HALF) {
141+      uint8x8_t qrp1, qpp1, qnp1;
142+      qrp1 = vld1_u8(rp);
143+      qpp1 = vld1_u8(pp);
144+      qnp1 = vld1_u8(np);
145+      qrp1 = vadd_u8(qrp1, qpp1);
146+      qnp1 = vadd_u8(qnp1, qrp1);
147+      vst1_u8(rp, qrp1);
148+      vst1_u8(np, qnp1);
149+      rp += STEP_RGBA_HALF;
150+      pp += STEP_RGBA_HALF;
151+      np += STEP_RGBA_HALF;
152+      count -= STEP_RGBA_HALF;
153+   }
154+
155+   for (int i = 0; i < count; i++) {
156+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
157+      *np = (png_byte)(((int)(*np) + (int)(*rp++)) & 0xff);
158+      np++;
159+   }
160+}
161+
162+void png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
163+   png_const_bytep prev_row)
164+{
165+   png_bytep rp = row;
166+   png_bytep rp_stop = row + row_info->rowbytes;
167+
168+   uint8x16_t vtmp = vld1q_u8(rp);
169+   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
170+   uint8x8x2_t vrp = *vrpt;
171+
172+   uint8x8x4_t vdest;
173+   vdest.val[IND3] = vdup_n_u8(0);
174+
175+   uint8x8_t vtmp1, vtmp2;
176+   uint32x2_t *temp_pointer;
177+
178+   png_debug(1, "in png_read_filter_row_sub3_neon");
179+
180+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
181+   png_byte last_byte = *rp_stop;
182+   png_bytep rp_stop_new = rp_stop - tail_bytes;
183+   for (; rp < rp_stop_new;)
184+   {
185+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
186+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
187+      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
188+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
189+
190+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
191+      vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2);
192+      vdest.val[IND3] = vadd_u8(vdest.val[IND2], vtmp1);
193+
194+      vtmp = vld1q_u8(rp + STEP_RGB);
195+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
196+      vrp = *vrpt;
197+
198+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
199+      rp += OFFSET3;
200+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
201+      rp += OFFSET3;
202+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
203+      rp += OFFSET3;
204+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
205+      rp += OFFSET3;
206+   }
207+
208+   if (tail_bytes == TAIL_RGB1) {
209+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
210+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
211+   } else if (tail_bytes == TAIL_RGB2) {
212+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
213+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
214+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
215+
216+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
217+      rp += OFFSET3;
218+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
219+   } else if (tail_bytes == TAIL_RGB3) {
220+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
221+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
222+      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
223+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
224+      vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2);
225+
226+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
227+      rp += OFFSET3;
228+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
229+      rp += OFFSET3;
230+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
231+   }
232+   *rp_stop = last_byte;
233+
234+   PNG_UNUSED(prev_row)
235+}
236+
237+void png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
238+   png_const_bytep prev_row)
239+{
240+   png_bytep rp = row;
241+   int count = row_info->rowbytes;
242+
243+   uint8x8x4_t vdest;
244+   vdest.val[IND3] = vdup_n_u8(0);
245+
246+   png_debug(1, "in png_read_filter_row_sub4_neon");
247+
248+   uint32x2x4_t vtmp;
249+   uint8x8x4_t *vrpt;
250+   uint8x8x4_t vrp;
251+   uint32x2x4_t vdest_val;
252+   while (count >= STEP_RGBA) {
253+      uint32x2x4_t *temp_pointer;
254+      vtmp = vld4_u32(png_ptr(uint32_t, rp));
255+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
256+      vrp = *vrpt;
257+
258+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
259+      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
260+      vdest.val[IND2] = vadd_u8(vdest.val[1], vrp.val[IND2]);
261+      vdest.val[IND3] = vadd_u8(vdest.val[IND2], vrp.val[IND3]);
262+
263+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
264+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
265+
266+      rp += STEP_RGBA;
267+      count -= STEP_RGBA;
268+   }
269+
270+   if (count >= STEP_RGBA_HALF) {
271+      uint32x2x2_t vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
272+      uint8x8x2_t *vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
273+      uint8x8x2_t vrp1 = *vrpt1;
274+      uint32x2x2_t *temp_pointer;
275+      uint32x2x2_t vdest_val1;
276+
277+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp1.val[0]);
278+      vdest.val[1] = vadd_u8(vdest.val[0], vrp1.val[1]);
279+      vdest.val[IND3] = vdest.val[1];
280+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
281+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
282+
283+      rp += STEP_RGBA_HALF;
284+      count -= STEP_RGBA_HALF;
285+   }
286+
287+   if (count == 0) {
288+      return;
289+   }
290+
291+   uint32x2_t vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
292+   uint8x8_t *vrpt2 = png_ptr(uint8x8_t, &vtmp2);
293+   uint8x8_t vrp2 = *vrpt2;
294+   uint32x2_t *temp_pointer;
295+   uint32x2_t vdest_val2;
296+
297+   vdest.val[0] = vadd_u8(vdest.val[IND3], vrp2);
298+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
299+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
300+
301+   PNG_UNUSED(prev_row)
302+}
303+
304+void png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
305+   png_const_bytep prev_row)
306+{
307+   png_bytep rp = row;
308+   png_const_bytep pp = prev_row;
309+   png_bytep rp_stop = row + row_info->rowbytes;
310+
311+   uint8x16_t vtmp;
312+   uint8x8x2_t *vrpt;
313+   uint8x8x2_t vrp;
314+   uint8x8x4_t vdest;
315+   vdest.val[IND3] = vdup_n_u8(0);
316+
317+   vtmp = vld1q_u8(rp);
318+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
319+   vrp = *vrpt;
320+
321+   png_debug(1, "in png_read_filter_row_avg3_neon");
322+
323+   uint8x8_t vtmp1, vtmp2, vtmp3;
324+   uint8x8x2_t *vppt;
325+   uint8x8x2_t vpp;
326+   uint32x2_t *temp_pointer;
327+
328+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
329+   png_byte last_byte = *rp_stop;
330+   png_bytep rp_stop_new = rp_stop - tail_bytes;
331+   for (; rp < rp_stop_new; pp += STEP_RGB)
332+   {
333+      vtmp = vld1q_u8(pp);
334+      vppt = png_ptr(uint8x8x2_t, &vtmp);
335+      vpp = *vppt;
336+
337+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
338+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
339+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
340+
341+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
342+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
343+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
344+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
345+
346+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
347+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
348+
349+      vtmp = vld1q_u8(rp + STEP_RGB);
350+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
351+      vrp = *vrpt;
352+
353+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
354+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
355+
356+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
357+
358+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2);
359+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
360+
361+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
362+      rp += OFFSET3;
363+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
364+      rp += OFFSET3;
365+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
366+      rp += OFFSET3;
367+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
368+      rp += OFFSET3;
369+   }
370+
371+   vtmp = vld1q_u8(pp);
372+   vppt = png_ptr(uint8x8x2_t, &vtmp);
373+   vpp = *vppt;
374+
375+   if (tail_bytes == TAIL_RGB1) {
376+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
377+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
378+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
379+   } else if (tail_bytes == TAIL_RGB2) {
380+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
381+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
382+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
383+
384+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
385+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
386+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
387+
388+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
389+      rp += OFFSET3;
390+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
391+   } else if (tail_bytes == TAIL_RGB3) {
392+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
393+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
394+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
395+
396+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
397+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
398+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
399+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
400+
401+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
402+
403+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
404+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
405+
406+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
407+      rp += OFFSET3;
408+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
409+      rp += OFFSET3;
410+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
411+   }
412+   *rp_stop = last_byte;
413+}
414+
415+void png_read_filter_row_avg3_x2_neon(png_row_infop row_info, png_bytep row,
416+   png_const_bytep prev_row)
417+{
418+   png_bytep rp = row;
419+   png_const_bytep pp = prev_row;
420+   png_bytep rp_stop = row + row_info->rowbytes;
421+   png_bytep np = rp_stop + 1;
422+
423+   uint8x16_t vtmp;
424+   uint8x8x2_t *vrpt;
425+   uint8x8x2_t vrp;
426+   uint8x8x4_t vdest;
427+   vdest.val[IND3] = vdup_n_u8(0);
428+
429+   vtmp = vld1q_u8(rp);
430+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
431+   vrp = *vrpt;
432+
433+   uint8x8x2_t *vnpt;
434+   uint8x8x2_t vnp;
435+   uint8x8x4_t vdestN;
436+   vdestN.val[IND3] = vdup_n_u8(0);
437+
438+   vtmp = vld1q_u8(np);
439+   vnpt = png_ptr(uint8x8x2_t, &vtmp);
440+   vnp = *vnpt;
441+
442+   png_debug(1, "in png_read_filter_row_x2_avg3_neon");
443+
444+   uint8x8_t vtmp1, vtmp2, vtmp3;
445+   uint8x8x2_t *vppt;
446+   uint8x8x2_t vpp;
447+   uint32x2_t *temp_pointer;
448+
449+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
450+   png_byte last_byte = *rp_stop;
451+   png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1);
452+   png_bytep rp_stop_new = rp_stop - tail_bytes;
453+   for (; rp < rp_stop_new; pp += STEP_RGB)
454+   {
455+      vtmp = vld1q_u8(pp);
456+      vppt = png_ptr(uint8x8x2_t, &vtmp);
457+      vpp = *vppt;
458+
459+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
460+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
461+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
462+
463+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
464+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
465+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
466+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
467+
468+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
469+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
470+
471+      vtmp = vld1q_u8(rp + STEP_RGB);
472+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
473+      vrp = *vrpt;
474+
475+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
476+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
477+
478+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
479+
480+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2);
481+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
482+
483+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
484+      rp += OFFSET3;
485+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
486+      rp += OFFSET3;
487+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
488+      rp += OFFSET3;
489+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
490+      rp += OFFSET3;
491+
492+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
493+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
494+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
495+
496+      vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
497+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
498+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
499+
500+      vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
501+
502+      vtmp = vld1q_u8(np + STEP_RGB);
503+      vnpt = png_ptr(uint8x8x2_t, &vtmp);
504+      vnp = *vnpt;
505+
506+      vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
507+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3);
508+
509+      vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]);
510+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1);
511+
512+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
513+      np += OFFSET3;
514+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
515+      np += OFFSET3;
516+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
517+      np += OFFSET3;
518+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0);
519+      np += OFFSET3;
520+   }
521+
522+   vtmp = vld1q_u8(pp);
523+   vppt = png_ptr(uint8x8x2_t, &vtmp);
524+   vpp = *vppt;
525+
526+   if (tail_bytes == TAIL_RGB1) {
527+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
528+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
529+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
530+
531+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
532+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
533+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
534+   } else if (tail_bytes == TAIL_RGB2) {
535+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
536+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
537+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
538+
539+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
540+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
541+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
542+
543+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
544+      rp += OFFSET3;
545+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
546+
547+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
548+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
549+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
550+
551+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
552+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
553+
554+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
555+      np += OFFSET3;
556+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
557+   } else if (tail_bytes == TAIL_RGB3) {
558+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
559+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
560+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
561+
562+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
563+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
564+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
565+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
566+
567+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
568+
569+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
570+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
571+
572+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
573+      rp += OFFSET3;
574+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
575+      rp += OFFSET3;
576+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
577+
578+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
579+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
580+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
581+
582+      vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
583+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
584+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
585+
586+      vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
587+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3);
588+
589+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
590+      np += OFFSET3;
591+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
592+      np += OFFSET3;
593+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
594+   }
595+   *rp_stop = last_byte;
596+   *(rp_stop + row_info->rowbytes + 1) = last_byte_next;
597+}
598+
599+void png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
600+   png_const_bytep prev_row)
601+{
602+   png_bytep rp = row;
603+   png_const_bytep pp = prev_row;
604+   int count = row_info->rowbytes;
605+
606+   uint8x8x4_t vdest;
607+   vdest.val[IND3] = vdup_n_u8(0);
608+
609+   png_debug(1, "in png_read_filter_row_avg4_neon");
610+
611+   uint32x2x4_t vtmp;
612+   uint8x8x4_t *vrpt, *vppt;
613+   uint8x8x4_t vrp, vpp;
614+   uint32x2x4_t vdest_val;
615+   while (count >= STEP_RGBA) {
616+      uint32x2x4_t *temp_pointer;
617+      vtmp = vld4_u32(png_ptr(uint32_t, rp));
618+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
619+      vrp = *vrpt;
620+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
621+      vppt = png_ptr(uint8x8x4_t, &vtmp);
622+      vpp = *vppt;
623+
624+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
625+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
626+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
627+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
628+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]);
629+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
630+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]);
631+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
632+
633+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
634+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
635+
636+      rp += STEP_RGBA;
637+      pp += STEP_RGBA;
638+      count -= STEP_RGBA;
639+   }
640+
641+   if (count >= STEP_RGBA_HALF) {
642+      uint32x2x2_t vtmp1;
643+      uint8x8x2_t *vrpt1, *vppt1;
644+      uint8x8x2_t vrp1, vpp1;
645+      uint32x2x2_t *temp_pointer;
646+      uint32x2x2_t vdest_val1;
647+
648+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
649+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
650+      vrp1 = *vrpt1;
651+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
652+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
653+      vpp1 = *vppt1;
654+
655+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]);
656+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
657+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
658+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
659+      vdest.val[IND3] = vdest.val[1];
660+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
661+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
662+
663+      rp += STEP_RGBA_HALF;
664+      pp += STEP_RGBA_HALF;
665+      count -= STEP_RGBA_HALF;
666+   }
667+
668+   if (count == 0) {
669+      return;
670+   }
671+
672+   uint32x2_t vtmp2;
673+   uint8x8_t *vrpt2, *vppt2;
674+   uint8x8_t vrp2, vpp2;
675+   uint32x2_t *temp_pointer;
676+   uint32x2_t vdest_val2;
677+
678+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
679+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
680+   vrp2 = *vrpt2;
681+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
682+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
683+   vpp2 = *vppt2;
684+
685+   vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2);
686+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
687+
688+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
689+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
690+}
691
692+void png_read_filter_row_avg4_x2_neon(png_row_infop row_info, png_bytep row,
693+   png_const_bytep prev_row)
694+{
695+   png_bytep rp = row;
696+   png_const_bytep pp = prev_row;
697+   int count = row_info->rowbytes;
698+   png_bytep np = row + count + 1;
699+
700+   uint8x8x4_t vdest;
701+   vdest.val[IND3] = vdup_n_u8(0);
702+
703+   png_debug(1, "in png_read_filter_row_avg4_x2_neon");
704+
705+   uint32x2x4_t vtmp;
706+   uint8x8x4_t *vrpt, *vppt;
707+   uint8x8x4_t vrp, vpp;
708+   uint32x2x4_t vdest_val;
709+
710+   uint8x8x4_t *vnpt;
711+   uint8x8x4_t vnp;
712+   uint8x8x4_t vdestN;
713+   vdestN.val[IND3] = vdup_n_u8(0);
714+
715+   while (count >= STEP_RGBA) {
716+      uint32x2x4_t *temp_pointer;
717+      vtmp = vld4_u32(png_ptr(uint32_t, rp));
718+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
719+      vrp = *vrpt;
720+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
721+      vppt = png_ptr(uint8x8x4_t, &vtmp);
722+      vpp = *vppt;
723+      vtmp = vld4_u32(png_ptrc(uint32_t, np));
724+      vnpt = png_ptr(uint8x8x4_t, &vtmp);
725+      vnp = *vnpt;
726+
727+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
728+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
729+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
730+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
731+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]);
732+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
733+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]);
734+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
735+
736+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
737+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
738+
739+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
740+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
741+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
742+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
743+      vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
744+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]);
745+      vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]);
746+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]);
747+
748+      vdest_val = png_ldr(uint32x2x4_t, &vdestN);
749+      vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0);
750+
751+      rp += STEP_RGBA;
752+      pp += STEP_RGBA;
753+      np += STEP_RGBA;
754+      count -= STEP_RGBA;
755+   }
756+
757+   if (count >= STEP_RGBA_HALF) {
758+      uint32x2x2_t vtmp1;
759+      uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
760+      uint8x8x2_t vrp1, vpp1, vnp1;
761+      uint32x2x2_t *temp_pointer;
762+      uint32x2x2_t vdest_val1;
763+
764+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
765+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
766+      vrp1 = *vrpt1;
767+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
768+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
769+      vpp1 = *vppt1;
770+      vtmp1 = vld2_u32(png_ptrc(uint32_t, np));
771+      vnpt1 = png_ptr(uint8x8x2_t, &vtmp1);
772+      vnp1 = *vnpt1;
773+
774+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]);
775+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
776+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
777+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
778+      vdest.val[IND3] = vdest.val[1];
779+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
780+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
781+
782+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
783+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
784+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
785+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
786+      vdestN.val[IND3] = vdestN.val[1];
787+      vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
788+      vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0);
789+
790+      rp += STEP_RGBA_HALF;
791+      pp += STEP_RGBA_HALF;
792+      np += STEP_RGBA_HALF;
793+      count -= STEP_RGBA_HALF;
794+   }
795+
796+   if (count == 0) {
797+      return;
798+   }
799+
800+   uint32x2_t vtmp2;
801+   uint8x8_t *vrpt2, *vppt2, *vnpt2;
802+   uint8x8_t vrp2, vpp2, vnp2;
803+   uint32x2_t *temp_pointer;
804+   uint32x2_t vdest_val2;
805+
806+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
807+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
808+   vrp2 = *vrpt2;
809+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
810+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
811+   vpp2 = *vppt2;
812+   vtmp2 = vld1_u32(png_ptrc(uint32_t, np));
813+   vnpt2 = png_ptr(uint8x8_t, &vtmp2);
814+   vnp2 = *vnpt2;
815+
816+   vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2);
817+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
818+
819+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
820+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
821+
822+   vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
823+   vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
824+
825+   vdest_val2 = png_ldr(uint32x2_t, &vdestN);
826+   vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0);
827+}
828+
829+static uint8x8_t paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
830+{
831+   uint8x8_t d, e;
832+   uint16x8_t p1, pa, pb, pc;
833+
834+   p1 = vaddl_u8(a, b); /* a + b */
835+   pc = vaddl_u8(c, c); /* c * 2 */
836+   pa = vabdl_u8(b, c); /* pa */
837+   pb = vabdl_u8(a, c); /* pb */
838+   pc = vabdq_u16(p1, pc); /* pc */
839+
840+   p1 = vcleq_u16(pa, pb); /* pa <= pb */
841+   pa = vcleq_u16(pa, pc); /* pa <= pc */
842+   pb = vcleq_u16(pb, pc); /* pb <= pc */
843+
844+   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
845+
846+   d = vmovn_u16(pb);
847+   e = vmovn_u16(p1);
848+
849+   d = vbsl_u8(d, b, c);
850+   e = vbsl_u8(e, a, d);
851+
852+   return e;
853+}
854+
855+void png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
856+   png_const_bytep prev_row)
857+{
858+   png_bytep rp = row;
859+   png_const_bytep pp = prev_row;
860+   png_bytep rp_stop = row + row_info->rowbytes;
861+
862+   uint8x16_t vtmp;
863+   uint8x8x2_t *vrpt;
864+   uint8x8x2_t vrp;
865+   uint8x8_t vlast = vdup_n_u8(0);
866+   uint8x8x4_t vdest;
867+   vdest.val[IND3] = vdup_n_u8(0);
868+
869+   vtmp = vld1q_u8(rp);
870+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
871+   vrp = *vrpt;
872+
873+   uint8x8x2_t *vppt;
874+   uint8x8x2_t vpp;
875+   uint8x8_t vtmp1, vtmp2, vtmp3;
876+   uint32x2_t *temp_pointer;
877+
878+   png_debug(1, "in png_read_filter_row_paeth3_neon");
879+
880+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
881+   png_byte last_byte = *rp_stop;
882+   png_bytep rp_stop_new = rp_stop - tail_bytes;
883+   for (; rp < rp_stop_new; pp += STEP_RGB)
884+   {
885+      vtmp = vld1q_u8(pp);
886+      vppt = png_ptr(uint8x8x2_t, &vtmp);
887+      vpp = *vppt;
888+
889+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
890+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
891+
892+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
893+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
894+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
895+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
896+
897+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
898+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
899+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
900+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
901+
902+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
903+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
904+
905+      vtmp = vld1q_u8(rp + STEP_RGB);
906+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
907+      vrp = *vrpt;
908+
909+      vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3);
910+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
911+
912+      vlast = vtmp2;
913+
914+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
915+      rp += OFFSET3;
916+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
917+      rp += OFFSET3;
918+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
919+      rp += OFFSET3;
920+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
921+      rp += OFFSET3;
922+   }
923+
924+   vtmp = vld1q_u8(pp);
925+   vppt = png_ptr(uint8x8x2_t, &vtmp);
926+   vpp = *vppt;
927+
928+   if (tail_bytes == TAIL_RGB1) {
929+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
930+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
931+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
932+   } else if (tail_bytes == TAIL_RGB2) {
933+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
934+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
935+
936+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
937+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
938+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
939+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
940+
941+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
942+      rp += OFFSET3;
943+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
944+   } else if (tail_bytes == TAIL_RGB3) {
945+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
946+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
947+
948+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
949+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
950+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
951+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
952+
953+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
954+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
955+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
956+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
957+
958+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
959+      rp += OFFSET3;
960+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
961+      rp += OFFSET3;
962+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
963+   }
964+   *rp_stop = last_byte;
965+}
966+
967+void png_read_filter_row_paeth3_x2_neon(png_row_infop row_info, png_bytep row,
968+   png_const_bytep prev_row)
969+{
970+   png_bytep rp = row;
971+   png_const_bytep pp = prev_row;
972+   png_bytep rp_stop = row + row_info->rowbytes;
973+   png_bytep np = rp_stop + 1;
974+
975+   uint8x16_t vtmp;
976+   uint8x8x2_t *vrpt;
977+   uint8x8x2_t vrp;
978+   uint8x8_t vlast = vdup_n_u8(0);
979+   uint8x8x4_t vdest;
980+   vdest.val[IND3] = vdup_n_u8(0);
981+
982+   vtmp = vld1q_u8(rp);
983+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
984+   vrp = *vrpt;
985+
986+   uint8x8x2_t *vppt;
987+   uint8x8x2_t vpp;
988+   uint8x8_t vtmp1, vtmp2, vtmp3;
989+   uint32x2_t *temp_pointer;
990+
991+   uint8x8x2_t *vnpt;
992+   uint8x8x2_t vnp;
993+   uint8x8_t vlastN = vdup_n_u8(0);
994+   uint8x8x4_t vdestN;
995+   vdestN.val[IND3] = vdup_n_u8(0);
996+
997+   vtmp = vld1q_u8(np);
998+   vnpt = png_ptr(uint8x8x2_t, &vtmp);
999+   vnp = *vnpt;
1000+
1001+   png_debug(1, "in png_read_filter_row_paeth3_x2_neon");
1002+
1003+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
1004+   png_byte last_byte = *rp_stop;
1005+   png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1);
1006+   png_bytep rp_stop_new = rp_stop - tail_bytes;
1007+
1008+   for (; rp < rp_stop_new; pp += STEP_RGB)
1009+   {
1010+      vtmp = vld1q_u8(pp);
1011+      vppt = png_ptr(uint8x8x2_t, &vtmp);
1012+      vpp = *vppt;
1013+
1014+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1015+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1016+
1017+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
1018+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
1019+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
1020+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
1021+
1022+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
1023+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
1024+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
1025+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
1026+
1027+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
1028+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
1029+
1030+      vtmp = vld1q_u8(rp + STEP_RGB);
1031+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
1032+      vrp = *vrpt;
1033+
1034+      vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3);
1035+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
1036+
1037+      vlast = vtmp2;
1038+
1039+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
1040+      rp += OFFSET3;
1041+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
1042+      rp += OFFSET3;
1043+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
1044+      rp += OFFSET3;
1045+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
1046+      rp += OFFSET3;
1047+
1048+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1049+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1050+
1051+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
1052+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1053+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
1054+
1055+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
1056+      vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
1057+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1);
1058+
1059+      vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
1060+
1061+      vtmp = vld1q_u8(np + STEP_RGB);
1062+      vnpt = png_ptr(uint8x8x2_t, &vtmp);
1063+      vnp = *vnpt;
1064+
1065+      vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]);
1066+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1);
1067+
1068+      vlastN = vdest.val[IND3];
1069+
1070+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1071+      np += OFFSET3;
1072+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
1073+      np += OFFSET3;
1074+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
1075+      np += OFFSET3;
1076+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0);
1077+      np += OFFSET3;
1078+   }
1079+
1080+   vtmp = vld1q_u8(pp);
1081+   vppt = png_ptr(uint8x8x2_t, &vtmp);
1082+   vpp = *vppt;
1083+
1084+   if (tail_bytes == TAIL_RGB1) {
1085+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1086+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1087+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
1088+
1089+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1090+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1091+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1092+   } else if (tail_bytes == TAIL_RGB2) {
1093+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1094+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1095+
1096+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
1097+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
1098+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
1099+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
1100+
1101+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
1102+      rp += OFFSET3;
1103+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
1104+
1105+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1106+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1107+
1108+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
1109+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1110+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
1111+
1112+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1113+      np += OFFSET3;
1114+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
1115+   } else if (tail_bytes == TAIL_RGB3) {
1116+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1117+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1118+
1119+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
1120+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
1121+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
1122+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
1123+
1124+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
1125+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
1126+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
1127+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
1128+
1129+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
1130+      rp += OFFSET3;
1131+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
1132+      rp += OFFSET3;
1133+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
1134+
1135+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1136+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1137+
1138+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
1139+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1140+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
1141+
1142+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
1143+      vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
1144+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1);
1145+
1146+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1147+      np += OFFSET3;
1148+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
1149+      np += OFFSET3;
1150+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
1151+   }
1152+   *rp_stop = last_byte;
1153+   *(rp_stop + row_info->rowbytes + 1) = last_byte_next;
1154+}
1155+
1156+void png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
1157+   png_const_bytep prev_row)
1158+{
1159+   png_bytep rp = row;
1160+   int count = row_info->rowbytes;
1161+   png_const_bytep pp = prev_row;
1162+
1163+   uint8x8_t vlast = vdup_n_u8(0);
1164+   uint8x8x4_t vdest;
1165+   vdest.val[IND3] = vdup_n_u8(0);
1166+
1167+   png_debug(1, "in png_read_filter_row_paeth4_neon");
1168+
1169+   uint32x2x4_t vtmp;
1170+   uint8x8x4_t *vrpt, *vppt;
1171+   uint8x8x4_t vrp, vpp;
1172+   uint32x2x4_t vdest_val;
1173+   while (count >= STEP_RGBA) {
1174+      uint32x2x4_t *temp_pointer;
1175+      vtmp = vld4_u32(png_ptr(uint32_t, rp));
1176+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
1177+      vrp = *vrpt;
1178+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
1179+      vppt = png_ptr(uint8x8x4_t, &vtmp);
1180+      vpp = *vppt;
1181+
1182+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1183+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1184+      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
1185+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
1186+      vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]);
1187+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
1188+      vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]);
1189+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
1190+
1191+      vlast = vpp.val[IND3];
1192+
1193+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
1194+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
1195+
1196+      rp += STEP_RGBA;
1197+      pp += STEP_RGBA;
1198+      count -= STEP_RGBA;
1199+   }
1200+
1201+   if (count >= STEP_RGBA_HALF) {
1202+      uint32x2x2_t vtmp1;
1203+      uint8x8x2_t *vrpt1, *vppt1;
1204+      uint8x8x2_t vrp1, vpp1;
1205+      uint32x2x2_t *temp_pointer;
1206+      uint32x2x2_t vdest_val1;
1207+
1208+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
1209+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
1210+      vrp1 = *vrpt1;
1211+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
1212+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
1213+      vpp1 = *vppt1;
1214+
1215+      vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast);
1216+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
1217+      vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
1218+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
1219+      vlast = vpp1.val[1];
1220+
1221+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
1222+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
1223+      vdest.val[IND3] = vdest.val[1];
1224+
1225+      rp += STEP_RGBA_HALF;
1226+      pp += STEP_RGBA_HALF;
1227+      count -= STEP_RGBA_HALF;
1228+   }
1229+
1230+   if (count == 0) {
1231+      return;
1232+   }
1233+
1234+   uint32x2_t vtmp2;
1235+   uint8x8_t *vrpt2, *vppt2;
1236+   uint8x8_t vrp2, vpp2;
1237+   uint32x2_t *temp_pointer;
1238+   uint32x2_t vdest_val2;
1239+
1240+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
1241+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
1242+   vrp2 = *vrpt2;
1243+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
1244+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
1245+   vpp2 = *vppt2;
1246+
1247+   vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast);
1248+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
1249+
1250+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
1251+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
1252+}
1253+
1254+void png_read_filter_row_paeth4_x2_neon(png_row_infop row_info, png_bytep row,
1255+   png_const_bytep prev_row)
1256+{
1257+   png_bytep rp = row;
1258+   int count = row_info->rowbytes;
1259+   png_const_bytep pp = prev_row;
1260+   png_bytep np = row + row_info->rowbytes + 1;
1261+
1262+   uint8x8_t vlast = vdup_n_u8(0);
1263+   uint8x8x4_t vdest;
1264+   vdest.val[IND3] = vdup_n_u8(0);
1265+
1266+   png_debug(1, "in png_read_filter_row_paeth4_x2_neon");
1267+
1268+   uint32x2x4_t vtmp;
1269+   uint8x8x4_t *vrpt, *vppt;
1270+   uint8x8x4_t vrp, vpp;
1271+   uint32x2x4_t vdest_val;
1272+
1273+   uint8x8x4_t *vnpt;
1274+   uint8x8x4_t vnp;
1275+   uint8x8_t vlastN = vdup_n_u8(0);
1276+   uint8x8x4_t vdestN;
1277+   vdestN.val[IND3] = vdup_n_u8(0);
1278+
1279+   while (count >= STEP_RGBA) {
1280+      uint32x2x4_t *temp_pointer;
1281+      vtmp = vld4_u32(png_ptr(uint32_t, rp));
1282+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
1283+      vrp = *vrpt;
1284+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
1285+      vppt = png_ptr(uint8x8x4_t, &vtmp);
1286+      vpp = *vppt;
1287+      vtmp = vld4_u32(png_ptrc(uint32_t, np));
1288+      vnpt = png_ptr(uint8x8x4_t, &vtmp);
1289+      vnp = *vnpt;
1290+
1291+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1292+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1293+      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
1294+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
1295+      vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]);
1296+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
1297+      vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]);
1298+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
1299+
1300+      vlast = vpp.val[IND3];
1301+
1302+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
1303+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
1304+
1305+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1306+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1307+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1308+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
1309+      vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
1310+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]);
1311+      vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]);
1312+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]);
1313+
1314+      vlastN = vdest.val[IND3];
1315+
1316+      vdest_val = png_ldr(uint32x2x4_t, &vdestN);
1317+      vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0);
1318+
1319+      rp += STEP_RGBA;
1320+      pp += STEP_RGBA;
1321+      np += STEP_RGBA;
1322+      count -= STEP_RGBA;
1323+   }
1324+
1325+   if (count >= STEP_RGBA_HALF) {
1326+      uint32x2x2_t vtmp1;
1327+      uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
1328+      uint8x8x2_t vrp1, vpp1, vnp1;
1329+      uint32x2x2_t *temp_pointer;
1330+      uint32x2x2_t vdest_val1;
1331+
1332+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
1333+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
1334+      vrp1 = *vrpt1;
1335+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
1336+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
1337+      vpp1 = *vppt1;
1338+      vtmp1 = vld2_u32(png_ptrc(uint32_t, np));
1339+      vnpt1 = png_ptr(uint8x8x2_t, &vtmp1);
1340+      vnp1 = *vnpt1;
1341+
1342+      vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast);
1343+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
1344+      vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
1345+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
1346+
1347+      vlast = vpp1.val[1];
1348+
1349+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
1350+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
1351+
1352+      vdest.val[IND3] = vdest.val[1];
1353+
1354+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1355+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
1356+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1357+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
1358+
1359+      vlastN = vdest.val[1];
1360+
1361+      vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
1362+      vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0);
1363+
1364+      vdestN.val[IND3] = vdestN.val[1];
1365+
1366+      rp += STEP_RGBA_HALF;
1367+      pp += STEP_RGBA_HALF;
1368+      np += STEP_RGBA_HALF;
1369+      count -= STEP_RGBA_HALF;
1370+   }
1371+
1372+   if (count == 0) {
1373+      return;
1374+   }
1375+
1376+   uint32x2_t vtmp2;
1377+   uint8x8_t *vrpt2, *vppt2, *vnpt2;
1378+   uint8x8_t vrp2, vpp2, vnp2;
1379+   uint32x2_t *temp_pointer;
1380+   uint32x2_t vdest_val2;
1381+
1382+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
1383+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
1384+   vrp2 = *vrpt2;
1385+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
1386+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
1387+   vpp2 = *vppt2;
1388+   vtmp2 = vld1_u32(png_ptrc(uint32_t, np));
1389+   vnpt2 = png_ptr(uint8x8_t, &vtmp2);
1390+   vnp2 = *vnpt2;
1391+
1392+   vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast);
1393+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
1394+
1395+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
1396+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
1397+
1398+   vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1399+   vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
1400+
1401+   vdest_val2 = png_ldr(uint32x2_t, &vdestN);
1402+   vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0);
1403+}
1404+#endif /* PNG_MULTY_LINE_ENABLE */
1405 #endif /* PNG_ARM_NEON_OPT > 0 */
1406 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
1407 #endif /* READ */
1408diff --git a/pngpread.c b/pngpread.c
1409index e283627b7..43ec512df 100644
1410--- a/pngpread.c
1411+++ b/pngpread.c
1412@@ -264,9 +264,22 @@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr)
1413       png_ptr->idat_size = png_ptr->push_length;
1414       png_ptr->process_mode = PNG_READ_IDAT_MODE;
1415       png_push_have_info(png_ptr, info_ptr);
1416-      png_ptr->zstream.avail_out =
1417-          (uInt) PNG_ROWBYTES(png_ptr->pixel_depth,
1418-          png_ptr->iwidth) + 1;
1419+#ifdef PNG_MULTY_LINE_ENABLE
1420+      // OH ISSUE: png optimize
1421+      if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1422+         (png_ptr->transformations & PNG_CHECK) == 0) {
1423+         int rest = png_ptr->num_rows - png_ptr->row_number;
1424+         int row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1425+         png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1426+             png_ptr->iwidth) + 1) * row_num;
1427+      }
1428+      else
1429+#endif
1430+      {
1431+         png_ptr->zstream.avail_out =
1432+            (uInt) PNG_ROWBYTES(png_ptr->pixel_depth,
1433+            png_ptr->iwidth) + 1;
1434+      }
1435       png_ptr->zstream.next_out = png_ptr->row_buf;
1436       return;
1437    }
1438@@ -623,6 +636,92 @@ png_push_read_IDAT(png_structrp png_ptr)
1439    }
1440 }
1441
1442+#ifdef PNG_MULTY_LINE_ENABLE
1443+// OH ISSUE: png optimize
1444+static void png_push_process_row_x2(png_structrp png_ptr,
1445+   png_row_info row_info_in)
1446+{
1447+   png_debug(1, "in png_push_process_row_x2");
1448+   png_row_info row_info = row_info_in;
1449+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1450+      png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
1451+
1452+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1453+   if (png_ptr->transformations != 0)
1454+      png_do_read_transformations(png_ptr, &row_info);
1455+#endif
1456+
1457+   if (png_ptr->transformed_pixel_depth == 0)
1458+   {
1459+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1460+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1461+         png_error(png_ptr, "progressive row overflow");
1462+   }
1463+
1464+   png_push_have_row(png_ptr, png_ptr->row_buf + 1);
1465+   png_read_push_finish_row(png_ptr);
1466+
1467+   png_ptr->row_buf = png_ptr->row_buf + png_ptr->rowbytes + 1;
1468+
1469+   // do it again
1470+   if (png_ptr->transformations != 0)
1471+   {
1472+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1473+   }
1474+   else
1475+   {
1476+      png_ptr->prev_row = png_ptr->row_buf;
1477+   }
1478+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1479+   if (png_ptr->transformations != 0)
1480+      png_do_read_transformations(png_ptr, &row_info);
1481+#endif
1482+
1483+   png_push_have_row(png_ptr, png_ptr->row_buf + 1);
1484+   png_read_push_finish_row(png_ptr);
1485+}
1486+
1487+static void png_push_process_multi_rows(png_structrp png_ptr, int row_num)
1488+{
1489+   png_debug(1, "in png_push_process_multi_rows");
1490+   uInt row_bytes =  png_ptr->rowbytes + 1;
1491+
1492+   png_row_info row_info;
1493+   row_info.width = png_ptr->iwidth;
1494+   row_info.color_type = png_ptr->color_type;
1495+   row_info.bit_depth = png_ptr->bit_depth;
1496+   row_info.channels = png_ptr->channels;
1497+   row_info.pixel_depth = png_ptr->pixel_depth;
1498+   row_info.rowbytes = png_ptr->rowbytes;
1499+
1500+   png_bytep temp_row = png_ptr->row_buf;
1501+   png_bytep temp_prev_row = png_ptr->prev_row;
1502+
1503+   for (int i = 0; i < row_num; i++) {
1504+      // check if the x2_filter is effective: only supports channels 3 or 4
1505+      if ((png_ptr->channels == 3 || png_ptr->channels == 4) &&
1506+          i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
1507+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1508+          png_ptr->row_buf[0] == png_ptr->row_buf[row_bytes])
1509+      {
1510+         png_push_process_row_x2(png_ptr, row_info);
1511+         png_ptr->row_buf = png_ptr->row_buf + row_bytes;
1512+         i++;
1513+         continue;
1514+      }
1515+      png_push_process_row(png_ptr);
1516+      png_ptr->row_buf = png_ptr->row_buf + row_bytes;
1517+   }
1518+
1519+   if (png_ptr->transformations == 0 && png_ptr->interlaced == 0)
1520+   {
1521+      png_ptr->prev_row = temp_prev_row;
1522+      memcpy(png_ptr->prev_row, png_ptr->row_buf - row_bytes, row_bytes);
1523+   }
1524+   png_ptr->row_buf = temp_row;
1525+}
1526+#endif
1527+
1528 void /* PRIVATE */
1529 png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1530     size_t buffer_length)
1531@@ -639,6 +738,17 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1532    /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1533    png_ptr->zstream.avail_in = (uInt)buffer_length;
1534
1535+#ifdef PNG_MULTY_LINE_ENABLE
1536+   // OH ISSUE: png optimize
1537+   int row_num = 1;
1538+   if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1539+       (png_ptr->transformations & PNG_CHECK) == 0)
1540+   {
1541+      int rest = png_ptr->num_rows - png_ptr->row_number;
1542+      row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1543+   }
1544+#endif
1545+
1546    /* Keep going until the decompressed data is all processed
1547     * or the stream marked as finished.
1548     */
1549@@ -655,9 +765,20 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1550       if (!(png_ptr->zstream.avail_out > 0))
1551       {
1552          /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1553+#ifdef PNG_MULTY_LINE_ENABLE
1554+         // OH ISSUE: png optimize
1555+         if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1556+             (png_ptr->transformations & PNG_CHECK) == 0)
1557+         {
1558+            int rest = png_ptr->num_rows - png_ptr->row_number;
1559+            row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1560+         }
1561+         png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1562+             png_ptr->iwidth) + 1) * row_num;
1563+#else
1564          png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1565              png_ptr->iwidth) + 1);
1566-
1567+#endif
1568          png_ptr->zstream.next_out = png_ptr->row_buf;
1569       }
1570
1571@@ -719,7 +840,12 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1572
1573          /* Do we have a complete row? */
1574          if (png_ptr->zstream.avail_out == 0)
1575+#ifdef PNG_MULTY_LINE_ENABLE
1576+            // OH ISSUE: png optimize
1577+            png_push_process_multi_rows(png_ptr, row_num);
1578+#else
1579             png_push_process_row(png_ptr);
1580+#endif
1581       }
1582
1583       /* And check for the end of the stream. */
1584@@ -738,6 +864,7 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1585 void /* PRIVATE */
1586 png_push_process_row(png_structrp png_ptr)
1587 {
1588+   png_debug(1, "in png_push_process_row");
1589    /* 1.5.6: row_info moved out of png_struct to a local here. */
1590    png_row_info row_info;
1591
1592@@ -762,8 +889,17 @@ png_push_process_row(png_structrp png_ptr)
1593     * it may not be in the future, so this was changed just to copy the
1594     * interlaced row count:
1595     */
1596-   memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1597-
1598+#ifdef PNG_MULTY_LINE_ENABLE
1599+   // OH ISSUE: png optimize
1600+   if (png_ptr->transformations == 0 && png_ptr->interlaced == 0)
1601+   {
1602+      png_ptr->prev_row = png_ptr->row_buf;
1603+   }
1604+   else
1605+#endif
1606+   {
1607+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1608+   }
1609 #ifdef PNG_READ_TRANSFORMS_SUPPORTED
1610    if (png_ptr->transformations != 0)
1611       png_do_read_transformations(png_ptr, &row_info);
1612diff --git a/pngpriv.h b/pngpriv.h
1613index fb521cf00..6027d9acc 100644
1614--- a/pngpriv.h
1615+++ b/pngpriv.h
1616@@ -189,6 +189,20 @@
1617 #     define PNG_ARM_NEON_IMPLEMENTATION 0
1618 #endif /* PNG_ARM_NEON_OPT > 0 */
1619
1620+#if defined(PNG_ARM_NEON_IMPLEMENTATION) && defined(PNG_ARM_NEON)
1621+// OH ISSUE: png optimize
1622+#  if PNG_ARM_NEON_IMPLEMENTATION == 1
1623+#    define PNG_MULTY_LINE_ENABLE
1624+#    define PNG_WRITE_NEON_ENABLE
1625+#    define PNG_INFLATE_MAX_SIZE (65536)
1626+#    define PNG_INFLATE_ROWS (50)
1627+#    define PNG_CHECK (PNG_EXPAND | PNG_STRIP_ALPHA | PNG_RGB_TO_GRAY | PNG_ENCODE_ALPHA | \
1628+       PNG_PACKSWAP | PNG_GRAY_TO_RGB | PNG_COMPOSE | PNG_SCALE_16_TO_8 | PNG_16_TO_8 | \
1629+       PNG_BACKGROUND_EXPAND | PNG_EXPAND_16 | PNG_PACK | PNG_ADD_ALPHA | PNG_EXPAND_tRNS | \
1630+       PNG_RGB_TO_GRAY_ERR | PNG_RGB_TO_GRAY_WARN | PNG_FILLER | PNG_USER_TRANSFORM)
1631+#  endif
1632+#endif
1633+
1634 #ifndef PNG_MIPS_MSA_OPT
1635 #  if defined(__mips_msa) && (__mips_isa_rev >= 5) && defined(PNG_ALIGNED_MEMORY_SUPPORTED)
1636 #     define PNG_MIPS_MSA_OPT 2
1637@@ -351,8 +365,14 @@
1638 #endif
1639
1640 #ifndef PNG_INTERNAL_FUNCTION
1641+// OH ISSUE: png optimize
1642+#  ifdef PNG_MULTY_LINE_ENABLE
1643+#    define PNG_HIDE __attribute__((visibility("hidden")))
1644+#  else
1645+#    define PNG_HIDE
1646+#  endif
1647 #  define PNG_INTERNAL_FUNCTION(type, name, args, attributes)\
1648-      PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_EMPTY attributes)
1649+      PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_HIDE attributes)
1650 #endif
1651
1652 #ifndef PNG_INTERNAL_CALLBACK
1653@@ -1304,6 +1324,50 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop
1654     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1655 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop
1656     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1657+#ifdef PNG_MULTY_LINE_ENABLE
1658+// OH ISSUE: png optimize
1659+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_up_x2_neon, (png_row_infop
1660+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1661+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg3_x2_neon, (png_row_infop
1662+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1663+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg4_x2_neon, (png_row_infop
1664+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1665+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth3_x2_neon, (png_row_infop
1666+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1667+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth4_x2_neon, (png_row_infop
1668+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1669+#endif
1670+#ifdef PNG_WRITE_NEON_ENABLE
1671+// OH ISSUE: png optimize
1672+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_sub3_neon, (png_structrp
1673+    png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
1674+PNG_INTERNAL_FUNCTION(void, png_write_filter_sub3_neon_only, (png_structrp
1675+    png_ptr, size_t row_bytes), PNG_EMPTY);
1676+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_sub4_neon, (png_structrp
1677+    png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
1678+PNG_INTERNAL_FUNCTION(void, png_write_filter_sub4_neon_only, (png_structrp
1679+    png_ptr, size_t row_bytes), PNG_EMPTY);
1680+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_up_neon, (png_structrp
1681+    png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
1682+PNG_INTERNAL_FUNCTION(void, png_write_filter_up_neon_only, (png_structrp
1683+    png_ptr, size_t row_bytes), PNG_EMPTY);
1684+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_avg3_neon, (png_structrp
1685+    png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
1686+PNG_INTERNAL_FUNCTION(void, png_write_filter_avg3_neon_only, (png_structrp
1687+    png_ptr, size_t row_bytes), PNG_EMPTY);
1688+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_avg4_neon, (png_structrp
1689+    png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
1690+PNG_INTERNAL_FUNCTION(void, png_write_filter_avg4_neon_only, (png_structrp
1691+    png_ptr, size_t row_bytes), PNG_EMPTY);
1692+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_paeth3_neon, (png_structrp
1693+    png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
1694+PNG_INTERNAL_FUNCTION(void, png_write_filter_paeth3_neon_only, (png_structrp
1695+    png_ptr, size_t row_bytes), PNG_EMPTY);
1696+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_paeth4_neon, (png_structrp
1697+    png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
1698+PNG_INTERNAL_FUNCTION(void, png_write_filter_paeth4_neon_only, (png_structrp
1699+    png_ptr, size_t row_bytes), PNG_EMPTY);
1700+#endif
1701 #endif
1702
1703 #if PNG_MIPS_MSA_OPT > 0
1704diff --git a/pngread.c b/pngread.c
1705index 8fa7d9f16..71be1a26c 100644
1706--- a/pngread.c
1707+++ b/pngread.c
1708@@ -54,7 +54,12 @@ png_create_read_struct_2,(png_const_charp user_png_ver, png_voidp error_ptr,
1709        * required (it will be zero in a write structure.)
1710        */
1711 #     ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1712+#ifdef PNG_MULTY_LINE_ENABLE
1713+         // OH ISSUE: png optimize
1714+         png_ptr->IDAT_read_size = PNG_INFLATE_MAX_SIZE;
1715+#else
1716          png_ptr->IDAT_read_size = PNG_IDAT_READ_SIZE;
1717+#endif
1718 #     endif
1719
1720 #     ifdef PNG_BENIGN_READ_ERRORS_SUPPORTED
1721@@ -684,6 +689,224 @@ png_read_rows(png_structrp png_ptr, png_bytepp row,
1722 #endif /* SEQUENTIAL_READ */
1723
1724 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1725+
1726+#ifdef PNG_MULTY_LINE_ENABLE
1727+// OH ISSUE: png optimize
1728+static void png_read_two_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 i,
1729+                         png_row_info row_info)
1730+{
1731+   png_debug1(1, "in png_read_two_rows %d", png_ptr->row_buf[0]);
1732+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1733+      png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
1734+
1735+#ifdef PNG_MNG_FEATURES_SUPPORTED
1736+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1737+      (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1738+   {
1739+      /* Intrapixel differencing */
1740+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1741+   }
1742+#endif
1743+
1744+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1745+   if (png_ptr->transformations
1746+#       ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1747+         || png_ptr->num_palette_max >= 0
1748+#       endif
1749+      )
1750+      png_do_read_transformations(png_ptr, &row_info);
1751+#endif
1752+
1753+   /* The transformed pixel depth should match the depth now in row_info. */
1754+   if (png_ptr->transformed_pixel_depth == 0)
1755+   {
1756+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1757+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1758+         png_error(png_ptr, "sequential row overflow");
1759+   }
1760+
1761+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1762+      png_error(png_ptr, "internal sequential row size calculation error");
1763+
1764+   if (rows[i] != NULL)
1765+      png_combine_row(png_ptr, rows[i], -1);
1766+
1767+   png_read_finish_row(png_ptr);
1768+
1769+   if (png_ptr->read_row_fn != NULL)
1770+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1771+
1772+   png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1;
1773+
1774+   // do again next line
1775+   memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1776+
1777+#ifdef PNG_MNG_FEATURES_SUPPORTED
1778+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1779+      (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1780+   {
1781+      /* Intrapixel differencing */
1782+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1783+   }
1784+#endif
1785+
1786+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1787+   if (png_ptr->transformations
1788+#       ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1789+         || png_ptr->num_palette_max >= 0
1790+#       endif
1791+      )
1792+      png_do_read_transformations(png_ptr, &row_info);
1793+#endif
1794+
1795+   /* The transformed pixel depth should match the depth now in row_info. */
1796+   if (png_ptr->transformed_pixel_depth == 0)
1797+   {
1798+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1799+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1800+         png_error(png_ptr, "sequential row overflow");
1801+   }
1802+
1803+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1804+      png_error(png_ptr, "internal sequential row size calculation error");
1805+
1806+   if (rows[i+1] != NULL)
1807+      png_combine_row(png_ptr, rows[i+1], -1);
1808+
1809+   png_read_finish_row(png_ptr);
1810+
1811+   if (png_ptr->read_row_fn != NULL)
1812+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1813+
1814+   png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1;
1815+}
1816+
1817+static void png_read_muilty_rows(png_structrp png_ptr, png_bytepp rows,
1818+   png_uint_32 row_num, png_row_info row_info_in)
1819+{
1820+   if (png_ptr == NULL)
1821+      return;
1822+
1823+   png_debug2(1, "in png_read_muilty_rows (row %lu, pass %d)",
1824+       (unsigned long)png_ptr->row_number, png_ptr->pass);
1825+
1826+   if ((png_ptr->mode & PNG_HAVE_IDAT) == 0)
1827+         png_error(png_ptr, "Invalid attempt to read row data");
1828+
1829+   /* Fill the row with IDAT data: */
1830+   uInt row_bytes =  row_info_in.rowbytes;
1831+   png_ptr->row_buf[0]=255; /* 255 to force error if no data was found */
1832+   png_read_IDAT_data(png_ptr, png_ptr->row_buf, (row_bytes + 1) * row_num);
1833+   png_bytep temp_row = png_ptr->row_buf;
1834+
1835+   for (png_uint_32 i = 0; i < row_num; i++) {
1836+      png_row_info row_info = row_info_in;
1837+      // check if the x2_filter is effective: only supports channels 3 or 4
1838+      if ((row_info_in.channels == 3 || row_info_in.channels == 4) &&
1839+          i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
1840+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1841+          png_ptr->row_buf[0] == png_ptr->row_buf[row_info_in.rowbytes + 1])
1842+      {
1843+         png_read_two_rows(png_ptr, rows, i, row_info);
1844+         i++;
1845+         continue;
1846+      }
1847+      if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE)
1848+      {
1849+         if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST)
1850+            png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1851+               png_ptr->prev_row + 1, png_ptr->row_buf[0]);
1852+         else
1853+            png_debug1(1, "bad adaptive filter value %d", png_ptr->row_buf[0]);
1854+      }
1855+
1856+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_in.rowbytes + 1);
1857+
1858+#ifdef PNG_MNG_FEATURES_SUPPORTED
1859+      if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1860+         (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1861+      {
1862+         /* Intrapixel differencing */
1863+         png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1864+      }
1865+#endif
1866+
1867+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1868+      if (png_ptr->transformations
1869+#        ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1870+            || png_ptr->num_palette_max >= 0
1871+#        endif
1872+         )
1873+         png_do_read_transformations(png_ptr, &row_info);
1874+#endif
1875+
1876+      /* The transformed pixel depth should match the depth now in row_info. */
1877+      if (png_ptr->transformed_pixel_depth == 0)
1878+      {
1879+         png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1880+         if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1881+            png_error(png_ptr, "sequential row overflow");
1882+      }
1883+
1884+      else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1885+         png_error(png_ptr, "internal sequential row size calculation error");
1886+
1887+      if (rows[i] != NULL)
1888+         png_combine_row(png_ptr, rows[i], -1);
1889+
1890+      png_read_finish_row(png_ptr);
1891+
1892+      if (png_ptr->read_row_fn != NULL)
1893+         (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1894+
1895+      png_ptr->row_buf = png_ptr->row_buf + row_bytes + 1;
1896+   }
1897+   png_ptr->row_buf = temp_row;
1898+}
1899+
1900+static void png_warn_check(png_structrp png_ptr)
1901+{
1902+#ifdef PNG_WARNINGS_SUPPORTED
1903+   /* Check for transforms that have been set but were defined out */
1904+#if defined(PNG_WRITE_INVERT_SUPPORTED) && !defined(PNG_READ_INVERT_SUPPORTED)
1905+   if ((png_ptr->transformations & PNG_INVERT_MONO) != 0)
1906+      png_warning(png_ptr, "PNG_READ_INVERT_SUPPORTED is not defined");
1907+#endif
1908+
1909+#if defined(PNG_WRITE_FILLER_SUPPORTED) && !defined(PNG_READ_FILLER_SUPPORTED)
1910+   if ((png_ptr->transformations & PNG_FILLER) != 0)
1911+      png_warning(png_ptr, "PNG_READ_FILLER_SUPPORTED is not defined");
1912+#endif
1913+
1914+#if defined(PNG_WRITE_PACKSWAP_SUPPORTED) && \
1915+    !defined(PNG_READ_PACKSWAP_SUPPORTED)
1916+   if ((png_ptr->transformations & PNG_PACKSWAP) != 0)
1917+      png_warning(png_ptr, "PNG_READ_PACKSWAP_SUPPORTED is not defined");
1918+#endif
1919+
1920+#if defined(PNG_WRITE_PACK_SUPPORTED) && !defined(PNG_READ_PACK_SUPPORTED)
1921+   if ((png_ptr->transformations & PNG_PACK) != 0)
1922+      png_warning(png_ptr, "PNG_READ_PACK_SUPPORTED is not defined");
1923+#endif
1924+
1925+#if defined(PNG_WRITE_SHIFT_SUPPORTED) && !defined(PNG_READ_SHIFT_SUPPORTED)
1926+   if ((png_ptr->transformations & PNG_SHIFT) != 0)
1927+      png_warning(png_ptr, "PNG_READ_SHIFT_SUPPORTED is not defined");
1928+#endif
1929+
1930+#if defined(PNG_WRITE_BGR_SUPPORTED) && !defined(PNG_READ_BGR_SUPPORTED)
1931+   if ((png_ptr->transformations & PNG_BGR) != 0)
1932+      png_warning(png_ptr, "PNG_READ_BGR_SUPPORTED is not defined");
1933+#endif
1934+
1935+#if defined(PNG_WRITE_SWAP_SUPPORTED) && !defined(PNG_READ_SWAP_SUPPORTED)
1936+   if ((png_ptr->transformations & PNG_SWAP_BYTES) != 0)
1937+      png_warning(png_ptr, "PNG_READ_SWAP_SUPPORTED is not defined");
1938+#endif
1939+#endif /* WARNINGS */
1940+}
1941+#endif // PNG_MULTY_LINE_ENABLE
1942+
1943 /* Read the entire image.  If the image has an alpha channel or a tRNS
1944  * chunk, and you have called png_handle_alpha()[*], you will need to
1945  * initialize the image to the current image that PNG will be overlaying.
1946@@ -745,13 +968,45 @@ png_read_image(png_structrp png_ptr, png_bytepp image)
1947
1948    image_height=png_ptr->height;
1949
1950-   for (j = 0; j < pass; j++)
1951-   {
1952+#ifdef PNG_MULTY_LINE_ENABLE
1953+   // OH ISSUE: png optimize
1954+   if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1955+       (png_ptr->transformations & PNG_CHECK) == 0) {
1956+      if ((png_ptr->flags & PNG_FLAG_ROW_INIT) == 0)
1957+         png_read_start_row(png_ptr);
1958+
1959+      png_warn_check(png_ptr);
1960+      png_row_info row_info;
1961+      row_info.width = png_ptr->iwidth;
1962+      row_info.color_type = png_ptr->color_type;
1963+      row_info.bit_depth = png_ptr->bit_depth;
1964+      row_info.channels = png_ptr->channels;
1965+      row_info.pixel_depth = png_ptr->pixel_depth;
1966+      row_info.rowbytes = png_ptr->rowbytes;
1967+
1968       rp = image;
1969-      for (i = 0; i < image_height; i++)
1970+      int row_num = PNG_INFLATE_ROWS;
1971+      for (i = 0; i < image_height; i += PNG_INFLATE_ROWS)
1972       {
1973-         png_read_row(png_ptr, *rp, NULL);
1974-         rp++;
1975+         if (image_height - i < PNG_INFLATE_ROWS)
1976+         {
1977+            row_num = image_height - i;
1978+         }
1979+         png_read_muilty_rows(png_ptr, rp, row_num, row_info);
1980+         rp += row_num;
1981+      }
1982+   }
1983+   else
1984+#endif
1985+   {
1986+      for (j = 0; j < pass; j++)
1987+      {
1988+         rp = image;
1989+         for (i = 0; i < image_height; i++)
1990+         {
1991+            png_read_row(png_ptr, *rp, NULL);
1992+            rp++;
1993+         }
1994       }
1995    }
1996 }
1997diff --git a/pngrutil.c b/pngrutil.c
1998index 9ac8ec11f..8afdf4fa5 100644
1999--- a/pngrutil.c
2000+++ b/pngrutil.c
2001@@ -4134,7 +4134,12 @@ png_read_filter_row(png_structrp pp, png_row_infop row_info, png_bytep row,
2002     * PNG_FILTER_OPTIMIZATIONS to a function that overrides the generic
2003     * implementations.  See png_init_filter_functions above.
2004     */
2005+#ifdef PNG_MULTY_LINE_ENABLE
2006+   // OH ISSUE: png optimize
2007+   if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST_X2)
2008+#else
2009    if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST)
2010+#endif
2011    {
2012       if (pp->read_filter[0] == NULL)
2013          png_init_filter_functions(pp);
2014@@ -4606,7 +4611,24 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED)
2015              row_bytes + 48);
2016
2017       else
2018+      {
2019+#ifdef PNG_MULTY_LINE_ENABLE
2020+         // OH ISSUE: png optimize
2021+         png_uint_32 row_num = 1;
2022+         if (png_ptr->bit_depth == 8 &&
2023+             (png_ptr->transformations & PNG_CHECK) == 0)
2024+         {
2025+            row_num = png_ptr->height < PNG_INFLATE_ROWS ?
2026+               png_ptr->height : PNG_INFLATE_ROWS;
2027+         }
2028+         png_ptr->big_row_buf = (png_bytep)png_malloc(
2029+            png_ptr, row_bytes * row_num + 48);
2030+         if (png_ptr->big_row_buf == NULL)
2031+            png_error(png_ptr, "png_malloc failed");
2032+#else
2033          png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
2034+#endif
2035+      }
2036
2037       png_ptr->big_prev_row = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
2038
2039diff --git a/pngstruct.h b/pngstruct.h
2040index e591d94d5..7c3846475 100644
2041--- a/pngstruct.h
2042+++ b/pngstruct.h
2043@@ -140,6 +140,14 @@ typedef const png_colorspace * PNG_RESTRICT png_const_colorspacerp;
2044 #define PNG_COLORSPACE_CANCEL(flags)        (0xffff ^ (flags))
2045 #endif /* COLORSPACE || GAMMA */
2046
2047+#ifdef PNG_MULTY_LINE_ENABLE
2048+// OH ISSUE: png optimize
2049+#define PNG_FILTER_VALUE_UP_X2      (6) // PNG_FILTER_VALUE_UP + 4
2050+#define PNG_FILTER_VALUE_AVG_X2     (7) // PNG_FILTER_VALUE_AVG + 4
2051+#define PNG_FILTER_VALUE_PAETH_X2   (8) // PNG_FILTER_VALUE_PAETH + 4
2052+#define PNG_FILTER_VALUE_LAST_X2    (9) // PNG_FILTER_VALUE_LAST + 4
2053+#endif
2054+
2055 struct png_struct_def
2056 {
2057 #ifdef PNG_SETJMP_SUPPORTED
2058@@ -467,8 +475,14 @@ struct png_struct_def
2059    png_bytep big_prev_row;
2060
2061 /* New member added in libpng-1.5.7 */
2062+#ifdef PNG_MULTY_LINE_ENABLE
2063+   // OH ISSUE: png optimize
2064+   void (*read_filter[PNG_FILTER_VALUE_LAST_X2 - 1])(png_row_infop row_info,
2065+      png_bytep row, png_const_bytep prev_row);
2066+#else
2067    void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info,
2068       png_bytep row, png_const_bytep prev_row);
2069+#endif
2070
2071 #ifdef PNG_READ_SUPPORTED
2072 #if defined(PNG_COLORSPACE_SUPPORTED) || defined(PNG_GAMMA_SUPPORTED)
2073diff --git a/pngtrans.c b/pngtrans.c
2074index 1100f46eb..99736747a 100644
2075--- a/pngtrans.c
2076+++ b/pngtrans.c
2077@@ -13,6 +13,19 @@
2078
2079 #include "pngpriv.h"
2080
2081+#ifdef PNG_MULTY_LINE_ENABLE
2082+#  if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
2083+#    include <arm64_neon.h>
2084+#  else
2085+#    include <arm_neon.h>
2086+#  endif
2087+#  define STEP_GRAY (16)
2088+#  define STEP_GA (32)
2089+#  define STEP_RGB (48)
2090+#  define STEP_RGBA (64)
2091+#  define INDEX2 (2)
2092+#endif
2093+
2094 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
2095
2096 #if defined(PNG_READ_BGR_SUPPORTED) || defined(PNG_WRITE_BGR_SUPPORTED)
2097@@ -269,13 +282,19 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2098    if (row_info->color_type == PNG_COLOR_TYPE_GRAY)
2099    {
2100       png_bytep rp = row;
2101-      size_t i;
2102-      size_t istop = row_info->rowbytes;
2103-
2104-      for (i = 0; i < istop; i++)
2105+      png_bytep rp_stop = row + row_info->rowbytes;
2106+#ifdef PNG_MULTY_LINE_ENABLE
2107+      png_bytep rp_stop_neon = rp_stop - STEP_GRAY;
2108+      for (; rp < rp_stop_neon; rp += STEP_GRAY)
2109+      {
2110+         uint8x16_t gray = vld1q_u8(rp);
2111+         gray = ~gray;
2112+         vst1q_u8(rp, gray);
2113+      }
2114+#endif
2115+      for (; rp < rp_stop; rp++)
2116       {
2117          *rp = (png_byte)(~(*rp));
2118-         rp++;
2119       }
2120    }
2121
2122@@ -283,13 +302,19 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2123       row_info->bit_depth == 8)
2124    {
2125       png_bytep rp = row;
2126-      size_t i;
2127-      size_t istop = row_info->rowbytes;
2128-
2129-      for (i = 0; i < istop; i += 2)
2130+      png_bytep rp_stop = row + row_info->rowbytes;
2131+#ifdef PNG_MULTY_LINE_ENABLE
2132+      png_bytep rp_stop_neon = rp_stop - STEP_GA;
2133+      for (; rp < rp_stop_neon; rp += STEP_GA)
2134+      {
2135+         uint8x16x2_t gray_alpha = vld2q_u8(rp);
2136+         gray_alpha.val[0] = ~gray_alpha.val[0];
2137+         vst2q_u8(rp, gray_alpha);
2138+      }
2139+#endif
2140+      for (; rp < rp_stop; rp += 2)
2141       {
2142          *rp = (png_byte)(~(*rp));
2143-         rp += 2;
2144       }
2145    }
2146
2147@@ -298,14 +323,21 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2148       row_info->bit_depth == 16)
2149    {
2150       png_bytep rp = row;
2151-      size_t i;
2152-      size_t istop = row_info->rowbytes;
2153-
2154-      for (i = 0; i < istop; i += 4)
2155+      png_bytep rp_stop = row + row_info->rowbytes;
2156+#ifdef PNG_MULTY_LINE_ENABLE
2157+      png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
2158+      for (; rp < rp_stop_neon; rp += STEP_RGBA)
2159+      {
2160+         uint8x16x4_t gray_alpha = vld4q_u8(rp);
2161+         gray_alpha.val[0] = ~gray_alpha.val[0];
2162+         gray_alpha.val[1] = ~gray_alpha.val[1];
2163+         vst4q_u8(rp, gray_alpha);
2164+      }
2165+#endif
2166+      for (; rp < rp_stop; rp += 4)
2167       {
2168          *rp = (png_byte)(~(*rp));
2169          *(rp + 1) = (png_byte)(~(*(rp + 1)));
2170-         rp += 4;
2171       }
2172    }
2173 #endif
2174@@ -323,10 +355,19 @@ png_do_swap(png_row_infop row_info, png_bytep row)
2175    if (row_info->bit_depth == 16)
2176    {
2177       png_bytep rp = row;
2178-      png_uint_32 i;
2179-      png_uint_32 istop= row_info->width * row_info->channels;
2180-
2181-      for (i = 0; i < istop; i++, rp += 2)
2182+      png_bytep rp_stop = row + row_info->rowbytes;
2183+#ifdef PNG_MULTY_LINE_ENABLE
2184+      png_bytep rp_stop_neon = rp_stop - STEP_GA;
2185+      for (; rp < rp_stop_neon; rp += STEP_GA)
2186+      {
2187+         uint8x16x2_t gray = vld2q_u8(rp);
2188+         uint8x16_t tmp = gray.val[0];
2189+         gray.val[0] = gray.val[1];
2190+         gray.val[1] = tmp;
2191+         vst2q_u8(rp, gray);
2192+      }
2193+#endif
2194+      for (; rp < rp_stop; rp += 2)
2195       {
2196 #ifdef PNG_BUILTIN_BSWAP16_SUPPORTED
2197          /* Feature added to libpng-1.6.11 for testing purposes, not
2198@@ -622,15 +663,24 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2199
2200    if ((row_info->color_type & PNG_COLOR_MASK_COLOR) != 0)
2201    {
2202-      png_uint_32 row_width = row_info->width;
2203       if (row_info->bit_depth == 8)
2204       {
2205          if (row_info->color_type == PNG_COLOR_TYPE_RGB)
2206          {
2207-            png_bytep rp;
2208-            png_uint_32 i;
2209-
2210-            for (i = 0, rp = row; i < row_width; i++, rp += 3)
2211+            png_bytep rp = row;
2212+            png_bytep rp_stop = row + row_info->rowbytes;
2213+#ifdef PNG_MULTY_LINE_ENABLE
2214+            png_bytep rp_stop_neon = rp_stop - STEP_RGB;
2215+            for (; rp < rp_stop_neon; rp += STEP_RGB)
2216+            {
2217+               uint8x16x3_t bgr = vld3q_u8(rp);
2218+               uint8x16_t tmp = bgr.val[INDEX2];
2219+               bgr.val[INDEX2] = bgr.val[0];
2220+               bgr.val[0] = tmp;
2221+               vst3q_u8(rp, bgr);
2222+            }
2223+#endif
2224+            for (; rp < rp_stop; rp += 3)
2225             {
2226                png_byte save = *rp;
2227                *rp = *(rp + 2);
2228@@ -640,10 +690,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2229
2230          else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
2231          {
2232-            png_bytep rp;
2233-            png_uint_32 i;
2234-
2235-            for (i = 0, rp = row; i < row_width; i++, rp += 4)
2236+            png_bytep rp = row;
2237+            png_bytep rp_stop = row + row_info->rowbytes;
2238+#ifdef PNG_MULTY_LINE_ENABLE
2239+            png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
2240+            for (; rp < rp_stop_neon; rp += STEP_RGBA)
2241+            {
2242+               uint8x16x4_t bgra = vld4q_u8(rp);
2243+               uint8x16_t tmp = bgra.val[INDEX2];
2244+               bgra.val[INDEX2] = bgra.val[0];
2245+               bgra.val[0] = tmp;
2246+               vst4q_u8(rp, bgra);
2247+            }
2248+#endif
2249+            for (; rp < rp_stop; rp += 4)
2250             {
2251                png_byte save = *rp;
2252                *rp = *(rp + 2);
2253@@ -657,10 +717,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2254       {
2255          if (row_info->color_type == PNG_COLOR_TYPE_RGB)
2256          {
2257-            png_bytep rp;
2258-            png_uint_32 i;
2259-
2260-            for (i = 0, rp = row; i < row_width; i++, rp += 6)
2261+            png_bytep rp = row;
2262+            png_bytep rp_stop = row + row_info->rowbytes;
2263+#ifdef PNG_MULTY_LINE_ENABLE
2264+            png_bytep rp_stop_neon = rp_stop - STEP_RGB;
2265+            for (; rp < rp_stop_neon; rp += STEP_RGB)
2266+            {
2267+               uint16x8x3_t bgr = vld3q_u16((unsigned short *)rp);
2268+               uint16x8_t tmp = bgr.val[INDEX2];
2269+               bgr.val[INDEX2] = bgr.val[0];
2270+               bgr.val[0] = tmp;
2271+               vst3q_u16((unsigned short *)rp, bgr);
2272+            }
2273+#endif
2274+            for (; rp < rp_stop; rp += 6)
2275             {
2276                png_byte save = *rp;
2277                *rp = *(rp + 4);
2278@@ -673,10 +743,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2279
2280          else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
2281          {
2282-            png_bytep rp;
2283-            png_uint_32 i;
2284-
2285-            for (i = 0, rp = row; i < row_width; i++, rp += 8)
2286+            png_bytep rp = row;
2287+            png_bytep rp_stop = row + row_info->rowbytes;
2288+#ifdef PNG_MULTY_LINE_ENABLE
2289+            png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
2290+            for (; rp < rp_stop_neon; rp += STEP_RGBA)
2291+            {
2292+               uint16x8x4_t bgra = vld4q_u16((unsigned short *)rp);
2293+               uint16x8_t tmp = bgra.val[INDEX2];
2294+               bgra.val[INDEX2] = bgra.val[0];
2295+               bgra.val[0] = tmp;
2296+               vst4q_u16((unsigned short *)rp, bgra);
2297+            }
2298+#endif
2299+            for (; rp < rp_stop; rp += 8)
2300             {
2301                png_byte save = *rp;
2302                *rp = *(rp + 4);
2303diff --git a/pngwutil.c b/pngwutil.c
2304index 16345e4c0..212c090b6 100644
2305--- a/pngwutil.c
2306+++ b/pngwutil.c
2307@@ -16,6 +16,20 @@
2308 #ifdef PNG_WRITE_SUPPORTED
2309
2310 #ifdef PNG_WRITE_INT_FUNCTIONS_SUPPORTED
2311+#ifdef PNG_WRITE_NEON_ENABLE
2312+// OH ISSUE: png optimize
2313+#  if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
2314+#    include <arm64_neon.h>
2315+#  else
2316+#    include <arm_neon.h>
2317+#  endif
2318+#  define STEP 16
2319+#  define MID 128
2320+#  define SHIFT_RGB 13
2321+#  define SHIFT_RGBA 12
2322+#  define BYTE_RGB 3
2323+#  define BYTE_RGBA 4
2324+#endif
2325 /* Place a 32-bit number into a buffer in PNG byte order.  We work
2326  * with unsigned numbers for convenience, although one supported
2327  * ancillary chunk uses signed (two's complement) numbers.
2328@@ -2275,10 +2289,939 @@ png_write_filtered_row(png_structrp png_ptr, png_bytep filtered_row,
2329     size_t row_bytes);
2330
2331 #ifdef PNG_WRITE_FILTER_SUPPORTED
2332+#ifdef PNG_WRITE_NEON_ENABLE
2333+size_t png_write_filter_sub3_neon(png_structrp png_ptr,
2334+    size_t row_bytes, size_t lmins)
2335+{
2336+   size_t sum = 0;
2337+   png_bytep rp = png_ptr->row_buf + 1;
2338+   png_bytep dp = png_ptr->try_row + 1;
2339+   png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
2340+
2341+   size_t count = row_bytes;
2342+   uint8x16_t tmp = vdupq_n_u8(0);
2343+   while (count >= STEP)
2344+   {
2345+      uint8x16_t qrp = vld1q_u8(rp);
2346+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB);
2347+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2348+      vst1q_u8(dp, qdp);
2349+      tmp = qrp;
2350+      int8x16_t v_s = vreinterpretq_s8_u8(qdp);
2351+      v_s = vabsq_s8(v_s);
2352+      uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
2353+      sum += vaddlvq_u8(v_u);
2354+      rp += STEP;
2355+      dp += STEP;
2356+      count -= STEP;
2357+      if (sum > lmins)
2358+      {
2359+         return sum;
2360+      }
2361+   }
2362+
2363+   if (count == row_bytes)
2364+   {
2365+      dp[0] = rp[0];
2366+      dp[1] = rp[1];
2367+      dp[2] = rp[2];
2368+      sum += MID - abs((int)dp[0] - MID);
2369+      sum += MID - abs((int)dp[1] - MID);
2370+      sum += MID - abs((int)dp[2] - MID);
2371+      rp += BYTE_RGB;
2372+      dp += BYTE_RGB;
2373+      count -= BYTE_RGB;
2374+   }
2375+
2376+   png_bytep lp = rp - BYTE_RGB;
2377+   while (count > 0)
2378+   {
2379+      *dp = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
2380+      sum += MID - abs((int)*dp++ - MID);
2381+      count--;
2382+      if (sum > lmins)
2383+      {
2384+         return sum;
2385+      }
2386+   }
2387+   return sum;
2388+}
2389+
2390+void png_write_filter_sub3_neon_only(png_structrp png_ptr,
2391+    size_t row_bytes)
2392+{
2393+   png_bytep rp = png_ptr->row_buf + 1;
2394+   png_bytep dp = png_ptr->try_row + 1;
2395+   png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
2396+
2397+   size_t count = row_bytes;
2398+   uint8x16_t tmp = vdupq_n_u8(0);
2399+   while (count >= STEP)
2400+   {
2401+      uint8x16_t qrp = vld1q_u8(rp);
2402+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB);
2403+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2404+      vst1q_u8(dp, qdp);
2405+      tmp = qrp;
2406+      rp += STEP;
2407+      dp += STEP;
2408+      count -= STEP;
2409+   }
2410+
2411+   if (count == row_bytes)
2412+   {
2413+      dp[0] = rp[0];
2414+      dp[1] = rp[1];
2415+      dp[2] = rp[2];
2416+      rp += BYTE_RGB;
2417+      dp += BYTE_RGB;
2418+      count -= BYTE_RGB;
2419+   }
2420+
2421+   png_bytep lp = rp - BYTE_RGB;
2422+   while (count > 0)
2423+   {
2424+      *dp++ = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
2425+      count--;
2426+   }
2427+}
2428+
2429+size_t png_write_filter_sub4_neon(png_structrp png_ptr,
2430+    size_t row_bytes, size_t lmins)
2431+{
2432+   size_t sum = 0;
2433+   png_bytep rp = png_ptr->row_buf + 1;
2434+   png_bytep dp = png_ptr->try_row + 1;
2435+   png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
2436+
2437+   size_t count = row_bytes;
2438+   uint8x16_t tmp = vdupq_n_u8(0);
2439+   while (count >= STEP)
2440+   {
2441+      uint8x16_t qrp = vld1q_u8(rp);
2442+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA);
2443+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2444+      vst1q_u8(dp, qdp);
2445+      tmp = qrp;
2446+      int8x16_t v_s = vreinterpretq_s8_u8(qdp);
2447+      v_s = vabsq_s8(v_s);
2448+      uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
2449+      sum += vaddlvq_u8(v_u);
2450+      rp += STEP;
2451+      dp += STEP;
2452+      count -= STEP;
2453+      if (sum > lmins)
2454+      {
2455+         return sum;
2456+      }
2457+   }
2458+
2459+   if (count == row_bytes)
2460+   {
2461+      dp[0] = rp[0];
2462+      dp[1] = rp[1];
2463+      dp[2] = rp[2];
2464+      dp[3] = rp[3];
2465+      sum += MID - abs((int)dp[0] - MID);
2466+      sum += MID - abs((int)dp[1] - MID);
2467+      sum += MID - abs((int)dp[2] - MID);
2468+      sum += MID - abs((int)dp[3] - MID);
2469+      rp += BYTE_RGBA;
2470+      dp += BYTE_RGBA;
2471+      count -= BYTE_RGBA;
2472+   }
2473+
2474+   png_bytep lp = rp - BYTE_RGBA;
2475+   while (count > 0)
2476+   {
2477+      *dp = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
2478+      sum += MID - abs((int)*dp++ - MID);
2479+      count--;
2480+      if (sum > lmins)
2481+      {
2482+         return sum;
2483+      }
2484+   }
2485+   return sum;
2486+}
2487+
2488+void png_write_filter_sub4_neon_only(png_structrp png_ptr,
2489+    size_t row_bytes)
2490+{
2491+   png_bytep rp = png_ptr->row_buf + 1;
2492+   png_bytep dp = png_ptr->try_row + 1;
2493+   png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
2494+
2495+   size_t count = row_bytes;
2496+   uint8x16_t tmp = vdupq_n_u8(0);
2497+   while (count >= STEP)
2498+   {
2499+      uint8x16_t qrp = vld1q_u8(rp);
2500+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA);
2501+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2502+      vst1q_u8(dp, qdp);
2503+      tmp = qrp;
2504+      rp += STEP;
2505+      dp += STEP;
2506+      count -= STEP;
2507+   }
2508+
2509+   if (count == row_bytes)
2510+   {
2511+      dp[0] = rp[0];
2512+      dp[1] = rp[1];
2513+      dp[2] = rp[2];
2514+      dp[3] = rp[3];
2515+      rp += BYTE_RGBA;
2516+      dp += BYTE_RGBA;
2517+      count -= BYTE_RGBA;
2518+   }
2519+
2520+   png_bytep lp = rp - BYTE_RGBA;
2521+   while (count > 0)
2522+   {
2523+      *dp++ = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
2524+      count--;
2525+   }
2526+}
2527+
2528+size_t png_write_filter_up_neon(png_structrp png_ptr,
2529+    size_t row_bytes, size_t lmins)
2530+{
2531+   size_t sum = 0;
2532+   png_bytep rp = png_ptr->row_buf + 1;
2533+   png_bytep dp = png_ptr->try_row + 1;
2534+   png_bytep pp = png_ptr->prev_row + 1;
2535+   png_ptr->try_row[0] = PNG_FILTER_VALUE_UP;
2536+
2537+   size_t count = row_bytes;
2538+   while (count >= STEP)
2539+   {
2540+      uint8x16_t qrp = vld1q_u8(rp);
2541+      uint8x16_t qpp = vld1q_u8(pp);
2542+      uint8x16_t qdp = vsubq_u8(qrp, qpp);
2543+      vst1q_u8(dp, qdp);
2544+      int8x16_t v_s = vreinterpretq_s8_u8(qdp);
2545+      v_s = vabsq_s8(v_s);
2546+      uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
2547+      sum += vaddlvq_u8(v_u);
2548+      rp += STEP;
2549+      pp += STEP;
2550+      dp += STEP;
2551+      count -= STEP;
2552+      if (sum > lmins)
2553+      {
2554+         return sum;
2555+      }
2556+   }
2557+
2558+   while (count > 0)
2559+   {
2560+      *dp = (png_byte)(((int)*rp++ - (int)*pp++) & 0xff);
2561+      sum += MID - abs((int)*dp++ - MID);
2562+      count--;
2563+      if (sum > lmins)
2564+      {
2565+         return sum;
2566+      }
2567+   }
2568+   return sum;
2569+}
2570+
2571+void png_write_filter_up_neon_only(png_structrp png_ptr, size_t row_bytes)
2572+{
2573+   png_bytep rp = png_ptr->row_buf + 1;
2574+   png_bytep dp = png_ptr->try_row + 1;
2575+   png_bytep pp = png_ptr->prev_row + 1;
2576+   png_ptr->try_row[0] = PNG_FILTER_VALUE_UP;
2577+
2578+   size_t count = row_bytes;
2579+   while (count >= STEP)
2580+   {
2581+      uint8x16_t qrp = vld1q_u8(rp);
2582+      uint8x16_t qpp = vld1q_u8(pp);
2583+      uint8x16_t qdp = vsubq_u8(qrp, qpp);
2584+      vst1q_u8(dp, qdp);
2585+      rp += STEP;
2586+      pp += STEP;
2587+      dp += STEP;
2588+      count -= STEP;
2589+   }
2590+
2591+   while (count > 0)
2592+   {
2593+      *dp++ = (png_byte)(((int)*rp++ - (int)*pp++) & 0xff);
2594+      count--;
2595+   }
2596+}
2597+
2598+size_t png_write_filter_avg3_neon(png_structrp png_ptr,
2599+    size_t row_bytes, size_t lmins)
2600+{
2601+   size_t sum = 0;
2602+   png_bytep rp = png_ptr->row_buf + 1;
2603+   png_bytep pp = png_ptr->prev_row + 1;
2604+   png_bytep dp = png_ptr->try_row + 1;
2605+   png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
2606+
2607+   size_t count = row_bytes;
2608+   uint8x16_t tmp = vdupq_n_u8(0);
2609+   while (count >= STEP)
2610+   {
2611+      uint8x16_t qrp = vld1q_u8(rp);
2612+      uint8x16_t qpp = vld1q_u8(pp);
2613+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB);
2614+      qlp = vhaddq_u8(qpp, qlp);
2615+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2616+      vst1q_u8(dp, qdp);
2617+      int8x16_t v_s = vreinterpretq_s8_u8(qdp);
2618+      v_s = vabsq_s8(v_s);
2619+      uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
2620+      sum += vaddlvq_u8(v_u);
2621+      tmp = qrp;
2622+      rp += STEP;
2623+      pp += STEP;
2624+      dp += STEP;
2625+      count -= STEP;
2626+      if (sum > lmins)
2627+      {
2628+         return sum;
2629+      }
2630+   }
2631+
2632+   if (count == row_bytes)
2633+   {
2634+      dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
2635+      dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
2636+      dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
2637+      sum += MID - abs((int)dp[0] - MID);
2638+      sum += MID - abs((int)dp[1] - MID);
2639+      sum += MID - abs((int)dp[2] - MID);
2640+      rp += BYTE_RGB;
2641+      pp += BYTE_RGB;
2642+      dp += BYTE_RGB;
2643+      count -= BYTE_RGB;
2644+   }
2645+
2646+   png_bytep lp = rp - BYTE_RGB;
2647+   while (count > 0)
2648+   {
2649+      *dp = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
2650+      count--;
2651+      sum += MID - abs((int)*dp++ - MID);
2652+      if (sum > lmins)
2653+      {
2654+         return sum;
2655+      }
2656+   }
2657+   return sum;
2658+}
2659+
2660+void png_write_filter_avg3_neon_only(png_structrp png_ptr,
2661+    size_t row_bytes)
2662+{
2663+   png_bytep rp = png_ptr->row_buf + 1;
2664+   png_bytep pp = png_ptr->prev_row + 1;
2665+   png_bytep dp = png_ptr->try_row + 1;
2666+   png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
2667+
2668+   size_t count = row_bytes;
2669+   uint8x16_t tmp = vdupq_n_u8(0);
2670+   while (count >= STEP)
2671+   {
2672+      uint8x16_t qrp = vld1q_u8(rp);
2673+      uint8x16_t qpp = vld1q_u8(pp);
2674+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB);
2675+      qlp = vhaddq_u8(qpp, qlp);
2676+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2677+      vst1q_u8(dp, qdp);
2678+      tmp = qrp;
2679+      rp += STEP;
2680+      pp += STEP;
2681+      dp += STEP;
2682+      count -= STEP;
2683+   }
2684+
2685+   if (count == row_bytes)
2686+   {
2687+      dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
2688+      dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
2689+      dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
2690+      rp += BYTE_RGB;
2691+      pp += BYTE_RGB;
2692+      dp += BYTE_RGB;
2693+      count -= BYTE_RGB;
2694+   }
2695+
2696+   png_bytep lp = rp - BYTE_RGB;
2697+   while (count > 0)
2698+   {
2699+      *dp++ = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
2700+      count--;
2701+   }
2702+}
2703+
2704+size_t png_write_filter_avg4_neon(png_structrp png_ptr,
2705+    size_t row_bytes, size_t lmins)
2706+{
2707+   size_t sum = 0;
2708+   png_bytep rp = png_ptr->row_buf + 1;
2709+   png_bytep pp = png_ptr->prev_row + 1;
2710+   png_bytep dp = png_ptr->try_row + 1;
2711+   png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
2712+
2713+   size_t count = row_bytes;
2714+   uint8x16_t tmp = vdupq_n_u8(0);
2715+   while (count >= STEP)
2716+   {
2717+      uint8x16_t qrp = vld1q_u8(rp);
2718+      uint8x16_t qpp = vld1q_u8(pp);
2719+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA);
2720+      qlp = vhaddq_u8(qpp, qlp);
2721+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2722+      vst1q_u8(dp, qdp);
2723+      int8x16_t v_s = vreinterpretq_s8_u8(qdp);
2724+      v_s = vabsq_s8(v_s);
2725+      uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
2726+      sum += vaddlvq_u8(v_u);
2727+      tmp = qrp;
2728+      rp += STEP;
2729+      pp += STEP;
2730+      dp += STEP;
2731+      count -= STEP;
2732+      if (sum > lmins)
2733+      {
2734+         return sum;
2735+      }
2736+   }
2737+
2738+   if (count == row_bytes)
2739+   {
2740+      dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
2741+      dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
2742+      dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
2743+      dp[3] = (png_byte)(((int)rp[3] - ((int)pp[3] / 2)) & 0xff);
2744+      sum += MID - abs((int)dp[0] - MID);
2745+      sum += MID - abs((int)dp[1] - MID);
2746+      sum += MID - abs((int)dp[2] - MID);
2747+      sum += MID - abs((int)dp[3] - MID);
2748+      rp += BYTE_RGBA;
2749+      pp += BYTE_RGBA;
2750+      dp += BYTE_RGBA;
2751+      count -= BYTE_RGBA;
2752+   }
2753+
2754+   png_bytep lp = rp - BYTE_RGBA;
2755+   while (count > 0)
2756+   {
2757+      *dp = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
2758+      count--;
2759+      sum += MID - abs((int)*dp++ - MID);
2760+      if (sum > lmins)
2761+      {
2762+         return sum;
2763+      }
2764+   }
2765+   return sum;
2766+}
2767+
2768+void png_write_filter_avg4_neon_only(png_structrp png_ptr,
2769+    size_t row_bytes)
2770+{
2771+   png_bytep rp = png_ptr->row_buf + 1;
2772+   png_bytep pp = png_ptr->prev_row + 1;
2773+   png_bytep dp = png_ptr->try_row + 1;
2774+   png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
2775+
2776+   size_t count = row_bytes;
2777+   uint8x16_t tmp = vdupq_n_u8(0);
2778+   while (count >= STEP)
2779+   {
2780+      uint8x16_t qrp = vld1q_u8(rp);
2781+      uint8x16_t qpp = vld1q_u8(pp);
2782+      uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA);
2783+      qlp = vhaddq_u8(qpp, qlp);
2784+      uint8x16_t qdp = vsubq_u8(qrp, qlp);
2785+      vst1q_u8(dp, qdp);
2786+      tmp = qrp;
2787+      rp += STEP;
2788+      pp += STEP;
2789+      dp += STEP;
2790+      count -= STEP;
2791+   }
2792+
2793+   if (count == row_bytes)
2794+   {
2795+      dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
2796+      dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
2797+      dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
2798+      dp[3] = (png_byte)(((int)rp[3] - ((int)pp[3] / 2)) & 0xff);
2799+      rp += BYTE_RGBA;
2800+      pp += BYTE_RGBA;
2801+      dp += BYTE_RGBA;
2802+      count -= BYTE_RGBA;
2803+   }
2804+
2805+   png_bytep lp = rp - BYTE_RGBA;
2806+   while (count > 0)
2807+   {
2808+      *dp++ = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
2809+      count--;
2810+   }
2811+}
2812+
2813+size_t png_write_filter_paeth3_neon(png_structrp png_ptr,
2814+    size_t row_bytes, size_t lmins)
2815+{
2816+   size_t sum = 0;
2817+   png_bytep rp = png_ptr->row_buf + 1;
2818+   png_bytep pp = png_ptr->prev_row + 1;
2819+   png_bytep dp = png_ptr->try_row + 1;
2820+   png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
2821+
2822+   size_t count = row_bytes;
2823+   uint8x16_t tmp_a = vdupq_n_u8(0);
2824+   uint8x16_t tmp_c = vdupq_n_u8(0);
2825+   while (count >= STEP)
2826+   {
2827+      uint8x16_t qrp = vld1q_u8(rp);
2828+      uint8x16_t b = vld1q_u8(pp);
2829+      uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGB);
2830+      uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGB);
2831+      tmp_a = qrp;
2832+      tmp_c = b;
2833+
2834+      int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)));
2835+      int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
2836+      int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
2837+      int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
2838+      int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
2839+      int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
2840+
2841+      int16x8_t p = vsubq_s16(b_hign, c_hign);
2842+      int16x8_t pc = vsubq_s16(a_hign, c_hign);
2843+      int16x8_t pa = vabsq_s16(p);
2844+      int16x8_t pb = vabsq_s16(pc);
2845+      pc = vabsq_s16(vaddq_s16(p, pc));
2846+      uint16x8_t p1_u = vcleq_s16(pa, pb);
2847+      uint16x8_t pa_u = vcleq_s16(pa, pc);
2848+      uint16x8_t pb_u = vcleq_s16(pb, pc);
2849+      p1_u = vandq_u16(p1_u, pa_u);
2850+      uint8x8_t d_hign = vmovn_u16(pb_u);
2851+      uint8x8_t e_hign = vmovn_u16(p1_u);
2852+
2853+      p = vsubq_s16(b_low, c_low);
2854+      pc = vsubq_s16(a_low, c_low);
2855+      pa = vabsq_s16(p);
2856+      pb = vabsq_s16(pc);
2857+      pc = vabsq_s16(vaddq_s16(p, pc));
2858+      p1_u = vcleq_s16(pa, pb);
2859+      pa_u = vcleq_s16(pa, pc);
2860+      pb_u = vcleq_s16(pb, pc);
2861+      p1_u = vandq_u16(p1_u, pa_u);
2862+      uint8x8_t d_low = vmovn_u16(pb_u);
2863+      uint8x8_t e_low = vmovn_u16(p1_u);
2864+
2865+      uint8x16_t d = vcombine_u8(d_low, d_hign);
2866+      uint8x16_t e = vcombine_u8(e_low, e_hign);
2867+      d = vbslq_u8(d, b, c);
2868+      e = vbslq_u8(e, a, d);
2869+
2870+      uint8x16_t qdp = vsubq_u8(qrp, e);
2871+      vst1q_u8(dp, qdp);
2872+      int8x16_t v_s = vreinterpretq_s8_u8(qdp);
2873+      v_s = vabsq_s8(v_s);
2874+      uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
2875+      sum += vaddlvq_u8(v_u);
2876+
2877+      rp += STEP;
2878+      pp += STEP;
2879+      dp += STEP;
2880+      count -= STEP;
2881+      if (sum > lmins)
2882+      {
2883+         return sum;
2884+      }
2885+   }
2886+
2887+   if (count == row_bytes)
2888+   {
2889+      dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
2890+      dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
2891+      dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
2892+      sum += MID - abs((int)dp[0] - MID);
2893+      sum += MID - abs((int)dp[1] - MID);
2894+      sum += MID - abs((int)dp[2] - MID);
2895+      rp += BYTE_RGB;
2896+      pp += BYTE_RGB;
2897+      dp += BYTE_RGB;
2898+      count -= BYTE_RGB;
2899+   }
2900+
2901+   png_bytep cp = pp - BYTE_RGB;
2902+   png_bytep lp = rp - BYTE_RGB;
2903+   while (count > 0)
2904+   {
2905+      int a, b, c, pa, pb, pc, p;
2906+
2907+      b = *pp++;
2908+      c = *cp++;
2909+      a = *lp++;
2910+
2911+      p = b - c;
2912+      pc = a - c;
2913+
2914+      pa = abs(p);
2915+      pb = abs(pc);
2916+      pc = abs(p + pc);
2917+
2918+      p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
2919+      *dp = (png_byte)(((int)*rp++ - p) & 0xff);
2920+
2921+      count--;
2922+      sum += MID - abs((int)*dp++ - MID);
2923+      if (sum > lmins)
2924+      {
2925+         return sum;
2926+      }
2927+   }
2928+   return sum;
2929+}
2930+
2931+void png_write_filter_paeth3_neon_only(png_structrp png_ptr,
2932+    size_t row_bytes)
2933+{
2934+   png_bytep rp = png_ptr->row_buf + 1;
2935+   png_bytep pp = png_ptr->prev_row + 1;
2936+   png_bytep dp = png_ptr->try_row + 1;
2937+   png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
2938+
2939+   size_t count = row_bytes;
2940+   uint8x16_t tmp_a = vdupq_n_u8(0);
2941+   uint8x16_t tmp_c = vdupq_n_u8(0);
2942+   while (count >= STEP)
2943+   {
2944+      uint8x16_t qrp = vld1q_u8(rp);
2945+      uint8x16_t b = vld1q_u8(pp);
2946+      uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGB);
2947+      uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGB);
2948+      tmp_a = qrp;
2949+      tmp_c = b;
2950+
2951+      int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)));
2952+      int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
2953+      int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
2954+      int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
2955+      int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
2956+      int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
2957+
2958+      int16x8_t p = vsubq_s16(b_hign, c_hign);
2959+      int16x8_t pc = vsubq_s16(a_hign, c_hign);
2960+      int16x8_t pa = vabsq_s16(p);
2961+      int16x8_t pb = vabsq_s16(pc);
2962+      pc = vabsq_s16(vaddq_s16(p, pc));
2963+      uint16x8_t p1_u = vcleq_s16(pa, pb);
2964+      uint16x8_t pa_u = vcleq_s16(pa, pc);
2965+      uint16x8_t pb_u = vcleq_s16(pb, pc);
2966+      p1_u = vandq_u16(p1_u, pa_u);
2967+      uint8x8_t d_hign = vmovn_u16(pb_u);
2968+      uint8x8_t e_hign = vmovn_u16(p1_u);
2969+
2970+      p = vsubq_s16(b_low, c_low);
2971+      pc = vsubq_s16(a_low, c_low);
2972+      pa = vabsq_s16(p);
2973+      pb = vabsq_s16(pc);
2974+      pc = vabsq_s16(vaddq_s16(p, pc));
2975+      p1_u = vcleq_s16(pa, pb);
2976+      pa_u = vcleq_s16(pa, pc);
2977+      pb_u = vcleq_s16(pb, pc);
2978+      p1_u = vandq_u16(p1_u, pa_u);
2979+      uint8x8_t d_low = vmovn_u16(pb_u);
2980+      uint8x8_t e_low = vmovn_u16(p1_u);
2981+
2982+      uint8x16_t d = vcombine_u8(d_low, d_hign);
2983+      uint8x16_t e = vcombine_u8(e_low, e_hign);
2984+      d = vbslq_u8(d, b, c);
2985+      e = vbslq_u8(e, a, d);
2986+
2987+      uint8x16_t qdp = vsubq_u8(qrp, e);
2988+      vst1q_u8(dp, qdp);
2989+
2990+      rp += STEP;
2991+      pp += STEP;
2992+      dp += STEP;
2993+      count -= STEP;
2994+   }
2995+
2996+   if (count == row_bytes)
2997+   {
2998+      dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
2999+      dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
3000+      dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
3001+      rp += BYTE_RGB;
3002+      pp += BYTE_RGB;
3003+      dp += BYTE_RGB;
3004+      count -= BYTE_RGB;
3005+   }
3006+
3007+   png_bytep cp = pp - BYTE_RGB;
3008+   png_bytep lp = rp - BYTE_RGB;
3009+   while (count > 0)
3010+   {
3011+      int a, b, c, pa, pb, pc, p;
3012+
3013+      b = *pp++;
3014+      c = *cp++;
3015+      a = *lp++;
3016+
3017+      p = b - c;
3018+      pc = a - c;
3019+
3020+      pa = abs(p);
3021+      pb = abs(pc);
3022+      pc = abs(p + pc);
3023+
3024+      p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3025+      *dp++ = (png_byte)(((int)*rp++ - p) & 0xff);
3026+      count--;
3027+   }
3028+}
3029+
3030+size_t png_write_filter_paeth4_neon(png_structrp png_ptr,
3031+    size_t row_bytes, size_t lmins)
3032+{
3033+   size_t sum = 0;
3034+   png_bytep rp = png_ptr->row_buf + 1;
3035+   png_bytep pp = png_ptr->prev_row + 1;
3036+   png_bytep dp = png_ptr->try_row + 1;
3037+   png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
3038+
3039+   size_t count = row_bytes;
3040+   uint8x16_t tmp_a = vdupq_n_u8(0);
3041+   uint8x16_t tmp_c = vdupq_n_u8(0);
3042+   while (count >= STEP)
3043+   {
3044+      uint8x16_t qrp = vld1q_u8(rp);
3045+      uint8x16_t b = vld1q_u8(pp);
3046+      uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGBA);
3047+      uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGBA);
3048+      tmp_a = qrp;
3049+      tmp_c = b;
3050+
3051+      int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)));
3052+      int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
3053+      int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
3054+      int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
3055+      int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
3056+      int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
3057+
3058+      int16x8_t p = vsubq_s16(b_hign, c_hign);
3059+      int16x8_t pc = vsubq_s16(a_hign, c_hign);
3060+      int16x8_t pa = vabsq_s16(p);
3061+      int16x8_t pb = vabsq_s16(pc);
3062+      pc = vabsq_s16(vaddq_s16(p, pc));
3063+      uint16x8_t p1_u = vcleq_s16(pa, pb);
3064+      uint16x8_t pa_u = vcleq_s16(pa, pc);
3065+      uint16x8_t pb_u = vcleq_s16(pb, pc);
3066+      p1_u = vandq_u16(p1_u, pa_u);
3067+      uint8x8_t d_hign = vmovn_u16(pb_u);
3068+      uint8x8_t e_hign = vmovn_u16(p1_u);
3069+
3070+      p = vsubq_s16(b_low, c_low);
3071+      pc = vsubq_s16(a_low, c_low);
3072+      pa = vabsq_s16(p);
3073+      pb = vabsq_s16(pc);
3074+      pc = vabsq_s16(vaddq_s16(p, pc));
3075+      p1_u = vcleq_s16(pa, pb);
3076+      pa_u = vcleq_s16(pa, pc);
3077+      pb_u = vcleq_s16(pb, pc);
3078+      p1_u = vandq_u16(p1_u, pa_u);
3079+      uint8x8_t d_low = vmovn_u16(pb_u);
3080+      uint8x8_t e_low = vmovn_u16(p1_u);
3081+
3082+      uint8x16_t d = vcombine_u8(d_low, d_hign);
3083+      uint8x16_t e = vcombine_u8(e_low, e_hign);
3084+      d = vbslq_u8(d, b, c);
3085+      e = vbslq_u8(e, a, d);
3086+
3087+      uint8x16_t qdp = vsubq_u8(qrp, e);
3088+      vst1q_u8(dp, qdp);
3089+      int8x16_t v_s = vreinterpretq_s8_u8(qdp);
3090+      v_s = vabsq_s8(v_s);
3091+      uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
3092+      sum += vaddlvq_u8(v_u);
3093+
3094+      rp += STEP;
3095+      pp += STEP;
3096+      dp += STEP;
3097+      count -= STEP;
3098+      if (sum > lmins)
3099+      {
3100+         return sum;
3101+      }
3102+   }
3103+
3104+   if (count == row_bytes)
3105+   {
3106+      dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
3107+      dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
3108+      dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
3109+      dp[3] = (png_byte)(((int)rp[3] - (int)pp[3]) & 0xff);
3110+      sum += MID - abs((int)dp[0] - MID);
3111+      sum += MID - abs((int)dp[1] - MID);
3112+      sum += MID - abs((int)dp[2] - MID);
3113+      sum += MID - abs((int)dp[3] - MID);
3114+      rp += BYTE_RGBA;
3115+      pp += BYTE_RGBA;
3116+      dp += BYTE_RGBA;
3117+      count -= BYTE_RGBA;
3118+   }
3119+
3120+   png_bytep cp = pp - BYTE_RGBA;
3121+   png_bytep lp = rp - BYTE_RGBA;
3122+   while (count > 0)
3123+   {
3124+      int a, b, c, pa, pb, pc, p;
3125+
3126+      b = *pp++;
3127+      c = *cp++;
3128+      a = *lp++;
3129+
3130+      p = b - c;
3131+      pc = a - c;
3132+
3133+      pa = abs(p);
3134+      pb = abs(pc);
3135+      pc = abs(p + pc);
3136+
3137+      p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3138+      *dp = (png_byte)(((int)*rp++ - p) & 0xff);
3139+
3140+      count--;
3141+      sum += MID - abs((int)*dp++ - MID);
3142+      if (sum > lmins)
3143+      {
3144+         return sum;
3145+      }
3146+   }
3147+   return sum;
3148+}
3149+
3150+void png_write_filter_paeth4_neon_only(png_structrp png_ptr,
3151+    size_t row_bytes)
3152+{
3153+   png_bytep rp = png_ptr->row_buf + 1;
3154+   png_bytep pp = png_ptr->prev_row + 1;
3155+   png_bytep dp = png_ptr->try_row + 1;
3156+   png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
3157+
3158+   size_t count = row_bytes;
3159+   uint8x16_t tmp_a = vdupq_n_u8(0);
3160+   uint8x16_t tmp_c = vdupq_n_u8(0);
3161+   while (count >= STEP)
3162+   {
3163+      uint8x16_t qrp = vld1q_u8(rp);
3164+      uint8x16_t b = vld1q_u8(pp);
3165+      uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGBA);
3166+      uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGBA);
3167+      tmp_a = qrp;
3168+      tmp_c = b;
3169+
3170+      int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)));
3171+      int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
3172+      int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
3173+      int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
3174+      int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
3175+      int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
3176+
3177+      int16x8_t p = vsubq_s16(b_hign, c_hign);
3178+      int16x8_t pc = vsubq_s16(a_hign, c_hign);
3179+      int16x8_t pa = vabsq_s16(p);
3180+      int16x8_t pb = vabsq_s16(pc);
3181+      pc = vabsq_s16(vaddq_s16(p, pc));
3182+      uint16x8_t p1_u = vcleq_s16(pa, pb);
3183+      uint16x8_t pa_u = vcleq_s16(pa, pc);
3184+      uint16x8_t pb_u = vcleq_s16(pb, pc);
3185+      p1_u = vandq_u16(p1_u, pa_u);
3186+      uint8x8_t d_hign = vmovn_u16(pb_u);
3187+      uint8x8_t e_hign = vmovn_u16(p1_u);
3188+
3189+      p = vsubq_s16(b_low, c_low);
3190+      pc = vsubq_s16(a_low, c_low);
3191+      pa = vabsq_s16(p);
3192+      pb = vabsq_s16(pc);
3193+      pc = vabsq_s16(vaddq_s16(p, pc));
3194+      p1_u = vcleq_s16(pa, pb);
3195+      pa_u = vcleq_s16(pa, pc);
3196+      pb_u = vcleq_s16(pb, pc);
3197+      p1_u = vandq_u16(p1_u, pa_u);
3198+      uint8x8_t d_low = vmovn_u16(pb_u);
3199+      uint8x8_t e_low = vmovn_u16(p1_u);
3200+
3201+      uint8x16_t d = vcombine_u8(d_low, d_hign);
3202+      uint8x16_t e = vcombine_u8(e_low, e_hign);
3203+      d = vbslq_u8(d, b, c);
3204+      e = vbslq_u8(e, a, d);
3205+
3206+      uint8x16_t qdp = vsubq_u8(qrp, e);
3207+      vst1q_u8(dp, qdp);
3208+
3209+      rp += STEP;
3210+      pp += STEP;
3211+      dp += STEP;
3212+      count -= STEP;
3213+   }
3214+
3215+   if (count == row_bytes)
3216+   {
3217+      dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
3218+      dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
3219+      dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
3220+      dp[3] = (png_byte)(((int)rp[3] - (int)pp[3]) & 0xff);
3221+      rp += BYTE_RGBA;
3222+      pp += BYTE_RGBA;
3223+      dp += BYTE_RGBA;
3224+      count -= BYTE_RGBA;
3225+   }
3226+
3227+   png_bytep cp = pp - BYTE_RGBA;
3228+   png_bytep lp = rp - BYTE_RGBA;
3229+   while (count > 0)
3230+   {
3231+      int a, b, c, pa, pb, pc, p;
3232+
3233+      b = *pp++;
3234+      c = *cp++;
3235+      a = *lp++;
3236+
3237+      p = b - c;
3238+      pc = a - c;
3239+
3240+      pa = abs(p);
3241+      pb = abs(pc);
3242+      pc = abs(p + pc);
3243+
3244+      p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3245+      *dp++ = (png_byte)(((int)*rp++ - p) & 0xff);
3246+      count--;
3247+   }
3248+}
3249+#endif
3250+
3251 static size_t /* PRIVATE */
3252 png_setup_sub_row(png_structrp png_ptr, png_uint_32 bpp,
3253     size_t row_bytes, size_t lmins)
3254 {
3255+#ifdef PNG_WRITE_NEON_ENABLE
3256+   if (bpp == 3)
3257+   {
3258+      return png_write_filter_sub3_neon(png_ptr, row_bytes, lmins);
3259+   }
3260+   if (bpp == 4)
3261+   {
3262+      return png_write_filter_sub4_neon(png_ptr, row_bytes, lmins);
3263+   }
3264+#endif
3265    png_bytep rp, dp, lp;
3266    size_t i;
3267    size_t sum = 0;
3268@@ -2318,6 +3261,16 @@ static void /* PRIVATE */
3269 png_setup_sub_row_only(png_structrp png_ptr, png_uint_32 bpp,
3270     size_t row_bytes)
3271 {
3272+#ifdef PNG_WRITE_NEON_ENABLE
3273+   if (bpp == 3)
3274+   {
3275+      return png_write_filter_sub3_neon_only(png_ptr, row_bytes);
3276+   }
3277+   if (bpp == 4)
3278+   {
3279+      return png_write_filter_sub4_neon_only(png_ptr, row_bytes);
3280+   }
3281+#endif
3282    png_bytep rp, dp, lp;
3283    size_t i;
3284
3285@@ -2339,6 +3292,9 @@ png_setup_sub_row_only(png_structrp png_ptr, png_uint_32 bpp,
3286 static size_t /* PRIVATE */
3287 png_setup_up_row(png_structrp png_ptr, size_t row_bytes, size_t lmins)
3288 {
3289+#ifdef PNG_WRITE_NEON_ENABLE
3290+   return png_write_filter_up_neon(png_ptr, row_bytes, lmins);
3291+#endif
3292    png_bytep rp, dp, pp;
3293    size_t i;
3294    size_t sum = 0;
3295@@ -2366,6 +3322,9 @@ png_setup_up_row(png_structrp png_ptr, size_t row_bytes, size_t lmins)
3296 static void /* PRIVATE */
3297 png_setup_up_row_only(png_structrp png_ptr, size_t row_bytes)
3298 {
3299+#ifdef PNG_WRITE_NEON_ENABLE
3300+   return png_write_filter_up_neon_only(png_ptr, row_bytes);
3301+#endif
3302    png_bytep rp, dp, pp;
3303    size_t i;
3304
3305@@ -2383,6 +3342,16 @@ static size_t /* PRIVATE */
3306 png_setup_avg_row(png_structrp png_ptr, png_uint_32 bpp,
3307     size_t row_bytes, size_t lmins)
3308 {
3309+#ifdef PNG_WRITE_NEON_ENABLE
3310+   if (bpp == 3)
3311+   {
3312+      return png_write_filter_avg3_neon(png_ptr, row_bytes, lmins);
3313+   }
3314+   if (bpp == 4)
3315+   {
3316+      return png_write_filter_avg4_neon(png_ptr, row_bytes, lmins);
3317+   }
3318+#endif
3319    png_bytep rp, dp, pp, lp;
3320    png_uint_32 i;
3321    size_t sum = 0;
3322@@ -2423,6 +3392,16 @@ static void /* PRIVATE */
3323 png_setup_avg_row_only(png_structrp png_ptr, png_uint_32 bpp,
3324     size_t row_bytes)
3325 {
3326+#ifdef PNG_WRITE_NEON_ENABLE
3327+   if (bpp == 3)
3328+   {
3329+      return png_write_filter_avg3_neon_only(png_ptr, row_bytes);
3330+   }
3331+   if (bpp == 4)
3332+   {
3333+      return png_write_filter_avg4_neon_only(png_ptr, row_bytes);
3334+   }
3335+#endif
3336    png_bytep rp, dp, pp, lp;
3337    png_uint_32 i;
3338
3339@@ -2445,6 +3424,16 @@ static size_t /* PRIVATE */
3340 png_setup_paeth_row(png_structrp png_ptr, png_uint_32 bpp,
3341     size_t row_bytes, size_t lmins)
3342 {
3343+#ifdef PNG_WRITE_NEON_ENABLE
3344+   if (bpp == 3)
3345+   {
3346+      return png_write_filter_paeth3_neon(png_ptr, row_bytes, lmins);
3347+   }
3348+   if (bpp == 4)
3349+   {
3350+      return png_write_filter_paeth4_neon(png_ptr, row_bytes, lmins);
3351+   }
3352+#endif
3353    png_bytep rp, dp, pp, cp, lp;
3354    size_t i;
3355    size_t sum = 0;
3356@@ -2506,6 +3495,16 @@ static void /* PRIVATE */
3357 png_setup_paeth_row_only(png_structrp png_ptr, png_uint_32 bpp,
3358     size_t row_bytes)
3359 {
3360+#ifdef PNG_WRITE_NEON_ENABLE
3361+   if (bpp == 3)
3362+   {
3363+      return png_write_filter_paeth3_neon_only(png_ptr, row_bytes);
3364+   }
3365+   if (bpp == 4)
3366+   {
3367+      return png_write_filter_paeth4_neon_only(png_ptr, row_bytes);
3368+   }
3369+#endif
3370    png_bytep rp, dp, pp, cp, lp;
3371    size_t i;
3372
3373@@ -2613,6 +3612,25 @@ png_write_find_filter(png_structrp png_ptr, png_row_infop row_info)
3374        */
3375       png_bytep rp;
3376       size_t sum = 0;
3377+#ifdef PNG_WRITE_NEON_ENABLE
3378+      size_t bytes = row_info->rowbytes;
3379+      rp = row_buf + 1;
3380+      while (bytes >= STEP)
3381+      {
3382+         uint8x16_t v = vld1q_u8(rp);
3383+         int8x16_t v_s = vreinterpretq_s8_u8(v);
3384+         v_s = vabsq_s8(v_s);
3385+         v = vreinterpretq_u8_s8(v_s);
3386+         sum += vaddlvq_u8(v);
3387+         rp += STEP;
3388+         bytes -= STEP;
3389+      }
3390+      while (bytes > 0)
3391+      {
3392+         sum += 128 - abs((int)*rp++ - 128);
3393+         bytes--;
3394+      }
3395+#else
3396       size_t i;
3397       unsigned int v;
3398
3399@@ -2627,7 +3645,7 @@ png_write_find_filter(png_structrp png_ptr, png_row_infop row_info)
3400 #endif
3401          }
3402       }
3403-
3404+#endif
3405       mins = sum;
3406    }
3407
3408