• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1diff --git a/arm/arm_init.c b/arm/arm_init.c
2index ab22525..af40b2b 100644
3--- a/arm/arm_init.c
4+++ b/arm/arm_init.c
5@@ -115,13 +115,21 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
6     * initialization function.)
7     */
8    pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon;
9-
10+#ifdef PNG_MULTY_LINE_ENABLE
11+   pp->read_filter[PNG_FILTER_VALUE_UP_X2-1] = png_read_filter_row_up_x2_neon;
12+#endif
13    if (bpp == 3)
14    {
15       pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon;
16       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon;
17       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
18          png_read_filter_row_paeth3_neon;
19+#ifdef PNG_MULTY_LINE_ENABLE
20+      pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
21+         png_read_filter_row_avg3_x2_neon;
22+      pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
23+         png_read_filter_row_paeth3_x2_neon;
24+#endif
25    }
26
27    else if (bpp == 4)
28@@ -130,6 +130,12 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
29       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon;
30       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
31           png_read_filter_row_paeth4_neon;
32+#ifdef PNG_MULTY_LINE_ENABLE
33+      pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
34+         png_read_filter_row_avg4_x2_neon;
35+      pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
36+         png_read_filter_row_paeth4_x2_neon;
37+#endif
38    }
39 }
40 #endif /* PNG_ARM_NEON_OPT > 0 */
41diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c
42index 4466d48..f11286a 100644
43--- a/arm/filter_neon_intrinsics.c
44+++ b/arm/filter_neon_intrinsics.c
45@@ -52,21 +52,90 @@ png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
46    png_const_bytep prev_row)
47 {
48    png_bytep rp = row;
49-   png_bytep rp_stop = row + row_info->rowbytes;
50    png_const_bytep pp = prev_row;
51+   int count = row_info->rowbytes;
52
53    png_debug(1, "in png_read_filter_row_up_neon");
54
55-   for (; rp < rp_stop; rp += 16, pp += 16)
56-   {
57-      uint8x16_t qrp, qpp;
58+   uint8x16_t qrp, qpp;
59+   while (count >= 16) {
60+      qrp = vld1q_u8(rp);
61+      qpp = vld1q_u8(pp);
62+      qrp = vaddq_u8(qrp, qpp);
63+      vst1q_u8(rp, qrp);
64+      rp += 16;
65+      pp += 16;
66+      count -= 16;
67+   }
68+
69+   if (count >= 8) {
70+      uint8x8_t qrp1, qpp1;
71+      qrp1 = vld1_u8(rp);
72+      qpp1 = vld1_u8(pp);
73+      qrp1 = vadd_u8(qrp1, qpp1);
74+      vst1_u8(rp, qrp1);
75+      rp += 8;
76+      pp += 8;
77+      count -= 8;
78+   }
79+
80+   int i = 0;
81+   for (i = 0; i < count; i++) {
82+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
83+      rp++;
84+   }
85+}
86+
87+#ifdef PNG_MULTY_LINE_ENABLE
88+void
89+png_read_filter_row_up_x2_neon(png_row_infop row_info, png_bytep row,
90+   png_const_bytep prev_row)
91+{
92+   png_bytep rp = row;
93+   png_const_bytep pp = prev_row;
94+   int count = row_info->rowbytes;
95+   png_bytep np = row + row_info->rowbytes + 1;
96+
97+   png_debug(1, "in png_read_filter_row_up_x2_neon");
98
99+   uint8x16_t qrp, qpp, qnp;
100+   while (count >= 16) {
101       qrp = vld1q_u8(rp);
102       qpp = vld1q_u8(pp);
103+      qnp = vld1q_u8(np);
104       qrp = vaddq_u8(qrp, qpp);
105+      qnp = vaddq_u8(qnp, qrp);
106       vst1q_u8(rp, qrp);
107+      vst1q_u8(np, qnp);
108+      rp += 16;
109+      pp += 16;
110+      np += 16;
111+      count -= 16;
112+   }
113+
114+   if (count >= 8) {
115+      uint8x8_t qrp1, qpp1, qnp1;
116+      qrp1 = vld1_u8(rp);
117+      qpp1 = vld1_u8(pp);
118+      qnp1 = vld1_u8(np);
119+      qrp1 = vadd_u8(qrp1, qpp1);
120+      qnp1 = vadd_u8(qnp1, qrp1);
121+      vst1_u8(rp, qrp1);
122+      vst1_u8(np, qnp1);
123+      rp += 8;
124+      pp += 8;
125+      np += 8;
126+      count -= 8;
127+   }
128+
129+   int i = 0;
130+   for (i = 0; i < count; i++) {
131+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
132+      *np = (png_byte)(((int)(*np) + (int)(*rp++)) & 0xff);
133+      np++;
134    }
135 }
136+#endif
137
138 void
139 png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
140@@ -82,13 +151,16 @@ png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
141    uint8x8x4_t vdest;
142    vdest.val[3] = vdup_n_u8(0);
143
144+   uint8x8_t vtmp1, vtmp2;
145+   uint32x2_t *temp_pointer;
146+
147    png_debug(1, "in png_read_filter_row_sub3_neon");
148
149-   for (; rp < rp_stop;)
150+   size_t tail_bytes = row_info->rowbytes % 12;
151+   png_byte last_byte = *rp_stop;
152+   png_bytep rp_stop_new = rp_stop - tail_bytes;
153+   for (; rp < rp_stop_new;)
154    {
155-      uint8x8_t vtmp1, vtmp2;
156-      uint32x2_t *temp_pointer;
157-
158       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
159       vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
160       vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
161@@ -112,6 +184,32 @@ png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
162       rp += 3;
163    }
164
165+   if (tail_bytes == 3) {
166+      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
167+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
168+   } else if (tail_bytes == 6) {
169+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
170+      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
171+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
172+
173+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
174+      rp += 3;
175+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
176+   } else if (tail_bytes == 9) {
177+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
178+      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
179+      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
180+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
181+      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
182+
183+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
184+      rp += 3;
185+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
186+      rp += 3;
187+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
188+   }
189+   *rp_stop = last_byte;
190+
191    PNG_UNUSED(prev_row)
192 }
193
194@@ -120,20 +218,22 @@ png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
195    png_const_bytep prev_row)
196 {
197    png_bytep rp = row;
198-   png_bytep rp_stop = row + row_info->rowbytes;
199+   int count = row_info->rowbytes;
200
201    uint8x8x4_t vdest;
202    vdest.val[3] = vdup_n_u8(0);
203
204    png_debug(1, "in png_read_filter_row_sub4_neon");
205
206-   for (; rp < rp_stop; rp += 16)
207-   {
208-      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
209-      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
210-      uint8x8x4_t vrp = *vrpt;
211+   uint32x2x4_t vtmp;
212+   uint8x8x4_t *vrpt;
213+   uint8x8x4_t vrp;
214+   uint32x2x4_t vdest_val;
215+   while (count >= 16) {
216       uint32x2x4_t *temp_pointer;
217-      uint32x2x4_t vdest_val;
218+      vtmp = vld4_u32(png_ptr(uint32_t,rp));
219+      vrpt = png_ptr(uint8x8x4_t,&vtmp);
220+      vrp = *vrpt;
221
222       vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
223       vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
224@@ -142,8 +242,42 @@ png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
225
226       vdest_val = png_ldr(uint32x2x4_t, &vdest);
227       vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
228+
229+      rp += 16;
230+      count -= 16;
231    }
232
233+   if (count >= 8) {
234+      uint32x2x2_t vtmp1 = vld2_u32(png_ptr(uint32_t,rp));
235+      uint8x8x2_t *vrpt1 = png_ptr(uint8x8x2_t,&vtmp1);
236+      uint8x8x2_t vrp1 = *vrpt1;
237+      uint32x2x2_t *temp_pointer;
238+      uint32x2x2_t vdest_val1;
239+
240+      vdest.val[0] = vadd_u8(vdest.val[3], vrp1.val[0]);
241+      vdest.val[1] = vadd_u8(vdest.val[0], vrp1.val[1]);
242+
243+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
244+      vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0);
245+
246+      rp += 8;
247+      count -= 8;
248+   }
249+
250+   if (count == 0) {
251+      return;
252+   }
253+
254+   uint32x2_t vtmp2 = vld1_u32(png_ptr(uint32_t,rp));
255+   uint8x8_t *vrpt2 = png_ptr(uint8x8_t,&vtmp2);
256+   uint8x8_t vrp2 = *vrpt2;
257+   uint32x2_t *temp_pointer;
258+   uint32x2_t vdest_val2;
259+
260+   vdest.val[0] = vadd_u8(vdest.val[1], vrp2);
261+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
262+   vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0);
263+
264    PNG_UNUSED(prev_row)
265 }
266
267@@ -167,15 +301,140 @@ png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
268
269    png_debug(1, "in png_read_filter_row_avg3_neon");
270
271-   for (; rp < rp_stop; pp += 12)
272+   uint8x8_t vtmp1, vtmp2, vtmp3;
273+   uint8x8x2_t *vppt;
274+   uint8x8x2_t vpp;
275+   uint32x2_t *temp_pointer;
276+
277+   size_t tail_bytes = row_info->rowbytes % 12;
278+   png_byte last_byte = *rp_stop;
279+   png_bytep rp_stop_new = rp_stop - tail_bytes;
280+   for (; rp < rp_stop_new; pp += 12)
281    {
282-      uint8x8_t vtmp1, vtmp2, vtmp3;
283+      vtmp = vld1q_u8(pp);
284+      vppt = png_ptr(uint8x8x2_t,&vtmp);
285+      vpp = *vppt;
286+
287+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
288+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
289+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
290+
291+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
292+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
293+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
294+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
295+
296+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
297+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
298+
299+      vtmp = vld1q_u8(rp + 12);
300+      vrpt = png_ptr(uint8x8x2_t,&vtmp);
301+      vrp = *vrpt;
302+
303+      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
304+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
305+
306+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
307+
308+      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
309+      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
310+
311+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
312+      rp += 3;
313+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
314+      rp += 3;
315+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
316+      rp += 3;
317+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
318+      rp += 3;
319+   }
320+
321+   vtmp = vld1q_u8(pp);
322+   vppt = png_ptr(uint8x8x2_t,&vtmp);
323+   vpp = *vppt;
324+
325+   if (tail_bytes == 3) {
326+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
327+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
328+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
329+   } else if (tail_bytes == 6) {
330+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
331+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
332+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
333+
334+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
335+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
336+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
337+
338+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
339+      rp += 3;
340+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
341+   } else if (tail_bytes == 9) {
342+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
343+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
344+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
345+
346+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
347+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
348+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
349+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
350+
351+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
352+
353+      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
354+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
355+
356+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
357+      rp += 3;
358+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
359+      rp += 3;
360+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
361+   }
362+   *rp_stop = last_byte;
363+}
364+
365+#ifdef PNG_MULTY_LINE_ENABLE
366+void
367+png_read_filter_row_avg3_x2_neon(png_row_infop row_info, png_bytep row,
368+   png_const_bytep prev_row)
369+{
370+   png_bytep rp = row;
371+   png_const_bytep pp = prev_row;
372+   png_bytep rp_stop = row + row_info->rowbytes;
373+   png_bytep np = rp_stop + 1;
374+
375+   uint8x16_t vtmp;
376+   uint8x8x2_t *vrpt;
377+   uint8x8x2_t vrp;
378+   uint8x8x4_t vdest;
379+   vdest.val[3] = vdup_n_u8(0);
380+
381+   vtmp = vld1q_u8(rp);
382+   vrpt = png_ptr(uint8x8x2_t,&vtmp);
383+   vrp = *vrpt;
384+
385+   uint8x8x2_t *vnpt;
386+   uint8x8x2_t vnp;
387+   uint8x8x4_t vdestN;
388+   vdestN.val[3] = vdup_n_u8(0);
389+
390+   vtmp = vld1q_u8(np);
391+   vnpt = png_ptr(uint8x8x2_t,&vtmp);
392+   vnp = *vnpt;
393
394-      uint8x8x2_t *vppt;
395-      uint8x8x2_t vpp;
396+   png_debug(1, "in png_read_filter_row_avg3_x2_neon");
397
398-      uint32x2_t *temp_pointer;
399+   uint8x8_t vtmp1, vtmp2, vtmp3;
400+   uint8x8x2_t *vppt;
401+   uint8x8x2_t vpp;
402+   uint32x2_t *temp_pointer;
403
404+   size_t tail_bytes = row_info->rowbytes % 12;
405+   png_byte last_byte = *rp_stop;
406+   png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1);
407+   png_bytep rp_stop_new = rp_stop - tail_bytes;
408+   for (; rp < rp_stop_new; pp += 12)
409+   {
410       vtmp = vld1q_u8(pp);
411       vppt = png_ptr(uint8x8x2_t,&vtmp);
412       vpp = *vppt;
413@@ -212,36 +471,245 @@ png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
414       rp += 3;
415       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
416       rp += 3;
417+
418+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3);
419+      vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]);
420+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
421+
422+      vtmp3 = vext_u8(vnp.val[0], vnp.val[1], 6);
423+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
424+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
425+
426+      vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
427+
428+      vtmp = vld1q_u8(np + 12);
429+      vnpt = png_ptr(uint8x8x2_t,&vtmp);
430+      vnp = *vnpt;
431+
432+      vdestN.val[2] = vhadd_u8(vdestN.val[1], vdest.val[2]);
433+      vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp3);
434+
435+      vdestN.val[3] = vhadd_u8(vdestN.val[2], vdest.val[3]);
436+      vdestN.val[3] = vadd_u8(vdestN.val[3], vtmp1);
437+
438+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
439+      np += 3;
440+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0);
441+      np += 3;
442+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0);
443+      np += 3;
444+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[3]), 0);
445+      np += 3;
446    }
447+
448+   vtmp = vld1q_u8(pp);
449+   vppt = png_ptr(uint8x8x2_t,&vtmp);
450+   vpp = *vppt;
451+
452+   if (tail_bytes == 3) {
453+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
454+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
455+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
456+
457+      vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]);
458+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
459+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
460+   } else if (tail_bytes == 6) {
461+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
462+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
463+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
464+
465+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
466+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
467+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
468+
469+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
470+      rp += 3;
471+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
472+
473+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3);
474+      vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]);
475+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
476+
477+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
478+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
479+
480+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
481+      np += 3;
482+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0);
483+   } else if (tail_bytes == 9) {
484+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
485+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
486+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
487+
488+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
489+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
490+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
491+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
492+
493+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
494+
495+      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
496+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
497+
498+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
499+      rp += 3;
500+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
501+      rp += 3;
502+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
503+
504+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3);
505+      vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]);
506+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
507+
508+      vtmp3 = vext_u8(vnp.val[0], vnp.val[1], 6);
509+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
510+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
511+
512+      vdestN.val[2] = vhadd_u8(vdestN.val[1], vdest.val[2]);
513+      vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp3);
514+
515+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
516+      np += 3;
517+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0);
518+      np += 3;
519+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0);
520+   }
521+   *rp_stop = last_byte;
522+   *(rp_stop + row_info->rowbytes + 1) = last_byte_next;
523 }
524+#endif
525
526 void
527 png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
528    png_const_bytep prev_row)
529 {
530    png_bytep rp = row;
531-   png_bytep rp_stop = row + row_info->rowbytes;
532    png_const_bytep pp = prev_row;
533+   int count = row_info->rowbytes;
534
535    uint8x8x4_t vdest;
536    vdest.val[3] = vdup_n_u8(0);
537
538    png_debug(1, "in png_read_filter_row_avg4_neon");
539
540-   for (; rp < rp_stop; rp += 16, pp += 16)
541-   {
542-      uint32x2x4_t vtmp;
543-      uint8x8x4_t *vrpt, *vppt;
544-      uint8x8x4_t vrp, vpp;
545+   uint32x2x4_t vtmp;
546+   uint8x8x4_t *vrpt, *vppt;
547+   uint8x8x4_t vrp, vpp;
548+   uint32x2x4_t vdest_val;
549+   while (count >= 16) {
550       uint32x2x4_t *temp_pointer;
551-      uint32x2x4_t vdest_val;
552+      vtmp = vld4_u32(png_ptr(uint32_t,rp));
553+      vrpt = png_ptr(uint8x8x4_t,&vtmp);
554+      vrp = *vrpt;
555+      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
556+      vppt = png_ptr(uint8x8x4_t,&vtmp);
557+      vpp = *vppt;
558+
559+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
560+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
561+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
562+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
563+      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
564+      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
565+      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
566+      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
567+
568+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
569+      vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
570+
571+      rp += 16;
572+      pp += 16;
573+      count -= 16;
574+   }
575+
576+   if (count >= 8) {
577+      uint32x2x2_t vtmp1;
578+      uint8x8x2_t *vrpt1, *vppt1;
579+      uint8x8x2_t vrp1, vpp1;
580+      uint32x2x2_t *temp_pointer;
581+      uint32x2x2_t vdest_val1;
582+
583+      vtmp1 = vld2_u32(png_ptr(uint32_t,rp));
584+      vrpt1 = png_ptr(uint8x8x2_t,&vtmp1);
585+      vrp1 = *vrpt1;
586+      vtmp1 = vld2_u32(png_ptrc(uint32_t,pp));
587+      vppt1 = png_ptr(uint8x8x2_t,&vtmp1);
588+      vpp1 = *vppt1;
589+
590+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp1.val[0]);
591+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
592+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
593+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
594+
595+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
596+      vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0);
597+
598+      rp += 8;
599+      pp += 8;
600+      count -= 8;
601+   }
602+
603+   if (count == 0) {
604+      return;
605+   }
606+
607+   uint32x2_t vtmp2;
608+   uint8x8_t *vrpt2, *vppt2;
609+   uint8x8_t vrp2, vpp2;
610+   uint32x2_t *temp_pointer;
611+   uint32x2_t vdest_val2;
612+
613+   vtmp2 = vld1_u32(png_ptr(uint32_t,rp));
614+   vrpt2 = png_ptr(uint8x8_t,&vtmp2);
615+   vrp2 = *vrpt2;
616+   vtmp2 = vld1_u32(png_ptrc(uint32_t,pp));
617+   vppt2 = png_ptr(uint8x8_t,&vtmp2);
618+   vpp2 = *vppt2;
619
620+   vdest.val[0] = vhadd_u8(vdest.val[1], vpp2);
621+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
622+
623+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
624+   vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0);
625+}
626+
627+#ifdef PNG_MULTY_LINE_ENABLE
628+void
629+png_read_filter_row_avg4_x2_neon(png_row_infop row_info, png_bytep row,
630+   png_const_bytep prev_row)
631+{
632+   png_bytep rp = row;
633+   png_const_bytep pp = prev_row;
634+   int count = row_info->rowbytes;
635+   png_bytep np = row + count + 1;
636+
637+   uint8x8x4_t vdest;
638+   vdest.val[3] = vdup_n_u8(0);
639+
640+   png_debug(1, "in png_read_filter_row_avg4_x2_neon");
641+
642+   uint32x2x4_t vtmp;
643+   uint8x8x4_t *vrpt, *vppt;
644+   uint8x8x4_t vrp, vpp;
645+   uint32x2x4_t vdest_val;
646+
647+   uint8x8x4_t *vnpt;
648+   uint8x8x4_t vnp;
649+   uint8x8x4_t vdestN;
650+   vdestN.val[3] = vdup_n_u8(0);
651+
652+   while (count >= 16) {
653+      uint32x2x4_t *temp_pointer;
654       vtmp = vld4_u32(png_ptr(uint32_t,rp));
655       vrpt = png_ptr(uint8x8x4_t,&vtmp);
656       vrp = *vrpt;
657       vtmp = vld4_u32(png_ptrc(uint32_t,pp));
658       vppt = png_ptr(uint8x8x4_t,&vtmp);
659       vpp = *vppt;
660+      vtmp = vld4_u32(png_ptrc(uint32_t,np));
661+      vnpt = png_ptr(uint8x8x4_t,&vtmp);
662+      vnp = *vnpt;
663
664       vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
665       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
666@@ -254,8 +722,97 @@ png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
667
668       vdest_val = png_ldr(uint32x2x4_t, &vdest);
669       vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
670+
671+      vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]);
672+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
673+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
674+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
675+      vdestN.val[2] = vhadd_u8(vdestN.val[1], vdest.val[2]);
676+      vdestN.val[2] = vadd_u8(vdestN.val[2], vnp.val[2]);
677+      vdestN.val[3] = vhadd_u8(vdestN.val[2], vdest.val[3]);
678+      vdestN.val[3] = vadd_u8(vdestN.val[3], vnp.val[3]);
679+
680+      vdest_val = png_ldr(uint32x2x4_t, &vdestN);
681+      vst4_lane_u32(png_ptr(uint32_t,np), vdest_val, 0);
682+
683+      rp += 16;
684+      pp += 16;
685+      np += 16;
686+      count -= 16;
687+   }
688+
689+   if (count >= 8) {
690+      uint32x2x2_t vtmp1;
691+      uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
692+      uint8x8x2_t vrp1, vpp1, vnp1;
693+      uint32x2x2_t *temp_pointer;
694+      uint32x2x2_t vdest_val1;
695+
696+      vtmp1 = vld2_u32(png_ptr(uint32_t,rp));
697+      vrpt1 = png_ptr(uint8x8x2_t,&vtmp1);
698+      vrp1 = *vrpt1;
699+      vtmp1 = vld2_u32(png_ptrc(uint32_t,pp));
700+      vppt1 = png_ptr(uint8x8x2_t,&vtmp1);
701+      vpp1 = *vppt1;
702+      vtmp1 = vld2_u32(png_ptrc(uint32_t,np));
703+      vnpt1 = png_ptr(uint8x8x2_t,&vtmp1);
704+      vnp1 = *vnpt1;
705+
706+      vdest.val[0] = vhadd_u8(vdest.val[3], vpp1.val[0]);
707+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
708+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
709+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
710+
711+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
712+      vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0);
713+
714+      vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]);
715+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
716+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
717+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
718+
719+      vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
720+      vst2_lane_u32(png_ptr(uint32_t,np), vdest_val1, 0);
721+
722+      rp += 8;
723+      pp += 8;
724+      np += 8;
725+      count -= 8;
726    }
727+
728+   if (count == 0) {
729+      return;
730+   }
731+
732+   uint32x2_t vtmp2;
733+   uint8x8_t *vrpt2, *vppt2, *vnpt2;
734+   uint8x8_t vrp2, vpp2, vnp2;
735+   uint32x2_t *temp_pointer;
736+   uint32x2_t vdest_val2;
737+
738+   vtmp2 = vld1_u32(png_ptr(uint32_t,rp));
739+   vrpt2 = png_ptr(uint8x8_t,&vtmp2);
740+   vrp2 = *vrpt2;
741+   vtmp2 = vld1_u32(png_ptrc(uint32_t,pp));
742+   vppt2 = png_ptr(uint8x8_t,&vtmp2);
743+   vpp2 = *vppt2;
744+   vtmp2 = vld1_u32(png_ptrc(uint32_t,np));
745+   vnpt2 = png_ptr(uint8x8_t,&vtmp2);
746+   vnp2 = *vnpt2;
747+
748+   vdest.val[0] = vhadd_u8(vdest.val[1], vpp2);
749+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
750+
751+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
752+   vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0);
753+
754+   vdestN.val[0] = vhadd_u8(vdestN.val[1], vdest.val[0]);
755+   vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
756+
757+   vdest_val2 = png_ldr(uint32x2_t, &vdestN);
758+   vst1_lane_u32(png_ptr(uint32_t,np), vdest_val2, 0);
759 }
760+#endif
761
762 static uint8x8_t
763 paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
764@@ -303,15 +860,145 @@ png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
765    vrpt = png_ptr(uint8x8x2_t,&vtmp);
766    vrp = *vrpt;
767
768+   uint8x8x2_t *vppt;
769+   uint8x8x2_t vpp;
770+   uint8x8_t vtmp1, vtmp2, vtmp3;
771+   uint32x2_t *temp_pointer;
772+
773    png_debug(1, "in png_read_filter_row_paeth3_neon");
774
775-   for (; rp < rp_stop; pp += 12)
776+   size_t tail_bytes = row_info->rowbytes % 12;
777+   png_byte last_byte = *rp_stop;
778+   png_bytep rp_stop_new = rp_stop - tail_bytes;
779+   for (; rp < rp_stop_new; pp += 12)
780    {
781-      uint8x8x2_t *vppt;
782-      uint8x8x2_t vpp;
783-      uint8x8_t vtmp1, vtmp2, vtmp3;
784-      uint32x2_t *temp_pointer;
785+      vtmp = vld1q_u8(pp);
786+      vppt = png_ptr(uint8x8x2_t,&vtmp);
787+      vpp = *vppt;
788+
789+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
790+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
791
792+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
793+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
794+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
795+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
796+
797+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
798+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
799+      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
800+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
801+
802+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
803+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
804+
805+      vtmp = vld1q_u8(rp + 12);
806+      vrpt = png_ptr(uint8x8x2_t,&vtmp);
807+      vrp = *vrpt;
808+
809+      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
810+      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
811+
812+      vlast = vtmp2;
813+
814+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
815+      rp += 3;
816+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
817+      rp += 3;
818+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
819+      rp += 3;
820+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
821+      rp += 3;
822+   }
823+
824+   vtmp = vld1q_u8(pp);
825+   vppt = png_ptr(uint8x8x2_t,&vtmp);
826+   vpp = *vppt;
827+
828+   if (tail_bytes == 3) {
829+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
830+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
831+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
832+   } else if (tail_bytes == 6) {
833+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
834+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
835+
836+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
837+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
838+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
839+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
840+
841+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
842+      rp += 3;
843+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
844+   } else if (tail_bytes == 9) {
845+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
846+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
847+
848+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
849+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
850+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
851+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
852+
853+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
854+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
855+      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
856+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
857+
858+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
859+      rp += 3;
860+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
861+      rp += 3;
862+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
863+   }
864+   *rp_stop = last_byte;
865+}
866+
867+#ifdef PNG_MULTY_LINE_ENABLE
868+void
869+png_read_filter_row_paeth3_x2_neon(png_row_infop row_info, png_bytep row,
870+   png_const_bytep prev_row)
871+{
872+   png_bytep rp = row;
873+   png_const_bytep pp = prev_row;
874+   png_bytep rp_stop = row + row_info->rowbytes;
875+   png_bytep np = rp_stop + 1;
876+
877+   uint8x16_t vtmp;
878+   uint8x8x2_t *vrpt;
879+   uint8x8x2_t vrp;
880+   uint8x8_t vlast = vdup_n_u8(0);
881+   uint8x8x4_t vdest;
882+   vdest.val[3] = vdup_n_u8(0);
883+
884+   vtmp = vld1q_u8(rp);
885+   vrpt = png_ptr(uint8x8x2_t,&vtmp);
886+   vrp = *vrpt;
887+
888+   uint8x8x2_t *vppt;
889+   uint8x8x2_t vpp;
890+   uint8x8_t vtmp1, vtmp2, vtmp3;
891+   uint32x2_t *temp_pointer;
892+
893+   uint8x8x2_t *vnpt;
894+   uint8x8x2_t vnp;
895+   uint8x8_t vlastN = vdup_n_u8(0);
896+   uint8x8x4_t vdestN;
897+   vdestN.val[3] = vdup_n_u8(0);
898+
899+   vtmp = vld1q_u8(np);
900+   vnpt = png_ptr(uint8x8x2_t,&vtmp);
901+   vnp = *vnpt;
902+
903+   png_debug(1, "in png_read_filter_row_paeth3_x2_neon");
904+
905+   size_t tail_bytes = row_info->rowbytes % 12;
906+   png_byte last_byte = *rp_stop;
907+   png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1);
908+   png_bytep rp_stop_new = rp_stop - tail_bytes;
909+
910+   for (; rp < rp_stop_new; pp += 12)
911+   {
912       vtmp = vld1q_u8(pp);
913       vppt = png_ptr(uint8x8x2_t,&vtmp);
914       vpp = *vppt;
915@@ -349,15 +1036,123 @@ png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
916       rp += 3;
917       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
918       rp += 3;
919+
920+      vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN);
921+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
922+
923+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3);
924+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
925+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
926+
927+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 6);
928+      vdestN.val[2] = paeth(vdestN.val[1], vdest.val[2], vdest.val[1]);
929+      vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp1);
930+
931+      vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
932+
933+      vtmp = vld1q_u8(np + 12);
934+      vnpt = png_ptr(uint8x8x2_t,&vtmp);
935+      vnp = *vnpt;
936+
937+      vdestN.val[3] = paeth(vdestN.val[2], vdest.val[3], vdest.val[2]);
938+      vdestN.val[3] = vadd_u8(vdestN.val[3], vtmp1);
939+
940+      vlastN = vdest.val[3];
941+
942+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
943+      np += 3;
944+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0);
945+      np += 3;
946+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0);
947+      np += 3;
948+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[3]), 0);
949+      np += 3;
950    }
951+
952+   vtmp = vld1q_u8(pp);
953+   vppt = png_ptr(uint8x8x2_t,&vtmp);
954+   vpp = *vppt;
955+
956+   if (tail_bytes == 3) {
957+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
958+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
959+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
960+
961+      vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN);
962+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
963+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
964+   } else if (tail_bytes == 6) {
965+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
966+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
967+
968+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
969+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
970+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
971+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
972+
973+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
974+      rp += 3;
975+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
976+
977+      vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN);
978+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
979+
980+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3);
981+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
982+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
983+
984+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
985+      np += 3;
986+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0);
987+
988+   } else if (tail_bytes == 9) {
989+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
990+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
991+
992+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
993+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
994+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
995+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
996+
997+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
998+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
999+      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
1000+      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
1001+
1002+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
1003+      rp += 3;
1004+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
1005+      rp += 3;
1006+      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
1007+
1008+      vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN);
1009+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1010+
1011+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3);
1012+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1013+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
1014+
1015+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 6);
1016+      vdestN.val[2] = paeth(vdestN.val[1], vdest.val[2], vdest.val[1]);
1017+      vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp1);
1018+
1019+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0);
1020+      np += 3;
1021+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0);
1022+      np += 3;
1023+      vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0);
1024+   }
1025+   *rp_stop = last_byte;
1026+   *(rp_stop + row_info->rowbytes + 1) = last_byte_next;
1027 }
1028+#endif
1029
1030 void
1031 png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
1032    png_const_bytep prev_row)
1033 {
1034    png_bytep rp = row;
1035-   png_bytep rp_stop = row + row_info->rowbytes;
1036+   int count = row_info->rowbytes;
1037    png_const_bytep pp = prev_row;
1038
1039    uint8x8_t vlast = vdup_n_u8(0);
1040@@ -366,20 +1161,129 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
1041
1042    png_debug(1, "in png_read_filter_row_paeth4_neon");
1043
1044-   for (; rp < rp_stop; rp += 16, pp += 16)
1045-   {
1046-      uint32x2x4_t vtmp;
1047-      uint8x8x4_t *vrpt, *vppt;
1048-      uint8x8x4_t vrp, vpp;
1049+   uint32x2x4_t vtmp;
1050+   uint8x8x4_t *vrpt, *vppt;
1051+   uint8x8x4_t vrp, vpp;
1052+   uint32x2x4_t vdest_val;
1053+   while (count >= 16) {
1054       uint32x2x4_t *temp_pointer;
1055-      uint32x2x4_t vdest_val;
1056+      vtmp = vld4_u32(png_ptr(uint32_t,rp));
1057+      vrpt = png_ptr(uint8x8x4_t,&vtmp);
1058+      vrp = *vrpt;
1059+      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
1060+      vppt = png_ptr(uint8x8x4_t,&vtmp);
1061+      vpp = *vppt;
1062
1063+      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
1064+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1065+      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
1066+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
1067+      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
1068+      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
1069+      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
1070+      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
1071+
1072+      vlast = vpp.val[3];
1073+
1074+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
1075+      vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
1076+
1077+      rp += 16;
1078+      pp += 16;
1079+      count -= 16;
1080+   }
1081+
1082+   if (count >= 8) {
1083+      uint32x2x2_t vtmp1;
1084+      uint8x8x2_t *vrpt1, *vppt1;
1085+      uint8x8x2_t vrp1, vpp1;
1086+      uint32x2x2_t *temp_pointer;
1087+      uint32x2x2_t vdest_val1;
1088+
1089+      vtmp1 = vld2_u32(png_ptr(uint32_t,rp));
1090+      vrpt1 = png_ptr(uint8x8x2_t,&vtmp1);
1091+      vrp1 = *vrpt1;
1092+      vtmp1 = vld2_u32(png_ptrc(uint32_t,pp));
1093+      vppt1 = png_ptr(uint8x8x2_t,&vtmp1);
1094+      vpp1 = *vppt1;
1095+
1096+      vdest.val[0] = paeth(vdest.val[3], vpp1.val[0], vlast);
1097+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
1098+      vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
1099+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
1100+      vlast = vpp1.val[1];
1101+
1102+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
1103+      vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0);
1104+      vdest.val[3] = vdest.val[1];
1105+
1106+      rp += 8;
1107+      pp += 8;
1108+      count -= 8;
1109+   }
1110+
1111+   if (count == 0) {
1112+      return;
1113+   }
1114+
1115+   uint32x2_t vtmp2;
1116+   uint8x8_t *vrpt2, *vppt2;
1117+   uint8x8_t vrp2, vpp2;
1118+   uint32x2_t *temp_pointer;
1119+   uint32x2_t vdest_val2;
1120+
1121+   vtmp2 = vld1_u32(png_ptr(uint32_t,rp));
1122+   vrpt2 = png_ptr(uint8x8_t,&vtmp2);
1123+   vrp2 = *vrpt2;
1124+   vtmp2 = vld1_u32(png_ptrc(uint32_t,pp));
1125+   vppt2 = png_ptr(uint8x8_t,&vtmp2);
1126+   vpp2 = *vppt2;
1127+
1128+   vdest.val[0] = paeth(vdest.val[3], vpp2, vlast);
1129+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
1130+
1131+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
1132+   vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0);
1133+}
1134+
1135+#ifdef PNG_MULTY_LINE_ENABLE
1136+void
1137+png_read_filter_row_paeth4_x2_neon(png_row_infop row_info, png_bytep row,
1138+   png_const_bytep prev_row)
1139+{
1140+   png_bytep rp = row;
1141+   int count = row_info->rowbytes;
1142+   png_const_bytep pp = prev_row;
1143+   png_bytep np = row + row_info->rowbytes + 1;
1144+
1145+   uint8x8_t vlast = vdup_n_u8(0);
1146+   uint8x8x4_t vdest;
1147+   vdest.val[3] = vdup_n_u8(0);
1148+
1149+   png_debug(1, "in png_read_filter_row_paeth4_x2_neon");
1150+
1151+   uint32x2x4_t vtmp;
1152+   uint8x8x4_t *vrpt, *vppt;
1153+   uint8x8x4_t vrp, vpp;
1154+   uint32x2x4_t vdest_val;
1155+
1156+   uint8x8x4_t *vnpt;
1157+   uint8x8x4_t vnp;
1158+   uint8x8_t vlastN = vdup_n_u8(0);
1159+   uint8x8x4_t vdestN;
1160+   vdestN.val[3] = vdup_n_u8(0);
1161+
1162+   while (count >= 16) {
1163+      uint32x2x4_t *temp_pointer;
1164       vtmp = vld4_u32(png_ptr(uint32_t,rp));
1165       vrpt = png_ptr(uint8x8x4_t,&vtmp);
1166       vrp = *vrpt;
1167       vtmp = vld4_u32(png_ptrc(uint32_t,pp));
1168       vppt = png_ptr(uint8x8x4_t,&vtmp);
1169       vpp = *vppt;
1170+      vtmp = vld4_u32(png_ptrc(uint32_t,np));
1171+      vnpt = png_ptr(uint8x8x4_t,&vtmp);
1172+      vnp = *vnpt;
1173
1174       vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
1175       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1176@@ -394,8 +1298,107 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
1177
1178       vdest_val = png_ldr(uint32x2x4_t, &vdest);
1179       vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
1180+
1181+      vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN);
1182+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1183+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1184+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
1185+      vdestN.val[2] = paeth(vdestN.val[1], vdest.val[2], vdest.val[1]);
1186+      vdestN.val[2] = vadd_u8(vdestN.val[2], vnp.val[2]);
1187+      vdestN.val[3] = paeth(vdestN.val[2], vdest.val[3], vdest.val[2]);
1188+      vdestN.val[3] = vadd_u8(vdestN.val[3], vnp.val[3]);
1189+
1190+      vlastN = vdest.val[3];
1191+
1192+      vdest_val = png_ldr(uint32x2x4_t, &vdestN);
1193+      vst4_lane_u32(png_ptr(uint32_t,np), vdest_val, 0);
1194+
1195+      rp += 16;
1196+      pp += 16;
1197+      np += 16;
1198+      count -= 16;
1199    }
1200+
1201+   if (count >= 8) {
1202+      uint32x2x2_t vtmp1;
1203+      uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
1204+      uint8x8x2_t vrp1, vpp1, vnp1;
1205+      uint32x2x2_t *temp_pointer;
1206+      uint32x2x2_t vdest_val1;
1207+
1208+      vtmp1 = vld2_u32(png_ptr(uint32_t,rp));
1209+      vrpt1 = png_ptr(uint8x8x2_t,&vtmp1);
1210+      vrp1 = *vrpt1;
1211+      vtmp1 = vld2_u32(png_ptrc(uint32_t,pp));
1212+      vppt1 = png_ptr(uint8x8x2_t,&vtmp1);
1213+      vpp1 = *vppt1;
1214+      vtmp1 = vld2_u32(png_ptrc(uint32_t,np));
1215+      vnpt1 = png_ptr(uint8x8x2_t,&vtmp1);
1216+      vnp1 = *vnpt1;
1217+
1218+      vdest.val[0] = paeth(vdest.val[3], vpp1.val[0], vlast);
1219+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
1220+      vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
1221+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
1222+
1223+      vlast = vpp1.val[1];
1224+
1225+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
1226+      vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0);
1227+
1228+      vdest.val[3] = vdest.val[1];
1229+
1230+      vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN);
1231+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
1232+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1233+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
1234+
1235+      vlastN = vdest.val[1];
1236+
1237+      vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
1238+      vst2_lane_u32(png_ptr(uint32_t,np), vdest_val1, 0);
1239+
1240+      vdestN.val[3] = vdestN.val[1];
1241+
1242+      rp += 8;
1243+      pp += 8;
1244+      np += 8;
1245+      count -= 8;
1246+   }
1247+
1248+   if (count == 0) {
1249+      return;
1250+   }
1251+
1252+   uint32x2_t vtmp2;
1253+   uint8x8_t *vrpt2, *vppt2, *vnpt2;
1254+   uint8x8_t vrp2, vpp2, vnp2;
1255+   uint32x2_t *temp_pointer;
1256+   uint32x2_t vdest_val2;
1257+
1258+   vtmp2 = vld1_u32(png_ptr(uint32_t,rp));
1259+   vrpt2 = png_ptr(uint8x8_t,&vtmp2);
1260+   vrp2 = *vrpt2;
1261+   vtmp2 = vld1_u32(png_ptrc(uint32_t,pp));
1262+   vppt2 = png_ptr(uint8x8_t,&vtmp2);
1263+   vpp2 = *vppt2;
1264+   vtmp2 = vld1_u32(png_ptrc(uint32_t,np));
1265+   vnpt2 = png_ptr(uint8x8_t,&vtmp2);
1266+   vnp2 = *vnpt2;
1267+
1268+   vdest.val[0] = paeth(vdest.val[3], vpp2, vlast);
1269+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
1270+
1271+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
1272+   vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0);
1273+
1274+   vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN);
1275+   vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
1276+
1277+   vdest_val2 = png_ldr(uint32x2_t, &vdestN);
1278+   vst1_lane_u32(png_ptr(uint32_t,np), vdest_val2, 0);
1279 }
1280+#endif
1281
1282 #endif /* PNG_ARM_NEON_OPT > 0 */
1283 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
1284diff --git a/pngpread.c b/pngpread.c
1285index e283627..f22f6a3 100644
1286--- a/pngpread.c
1287+++ b/pngpread.c
1288@@ -262,11 +262,35@@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr)
1289    else if (chunk_name == png_IDAT)
1290    {
1291       png_ptr->idat_size = png_ptr->push_length;
1292+
1293+#ifdef PNG_MULTY_LINE_ENABLE
1294+      // init inflate_buff
1295+      if (png_ptr->inflate_buff_max_size < png_ptr->push_length)
1296+      {
1297+         png_free(png_ptr, png_ptr->inflate_buff);
1298+         png_ptr->inflate_buff = png_voidcast(png_bytep,
1299+            png_malloc(png_ptr, png_ptr->push_length));
1300+         png_ptr->inflate_buff_size = 0;
1301+      }
1302+      png_ptr->inflate_buff_max_size = png_ptr->push_length;
1303+#endif
1304+
1305       png_ptr->process_mode = PNG_READ_IDAT_MODE;
1306       png_push_have_info(png_ptr, info_ptr);
1307-      png_ptr->zstream.avail_out =
1308-          (uInt) PNG_ROWBYTES(png_ptr->pixel_depth,
1309-          png_ptr->iwidth) + 1;
1310+#ifdef PNG_MULTY_LINE_ENABLE
1311+      if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1312+         (png_ptr->transformations & PNG_CHECK) == 0) {
1313+         int rest = png_ptr->num_rows - png_ptr->row_number;
1314+         int row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1315+         png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1316+             png_ptr->iwidth) + 1) * row_num; // 一次解压多行
1317+      }
1318+      else
1319+#endif
1320+      {
1321+         png_ptr->zstream.avail_out =
1322+            (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, png_ptr->iwidth) + 1);
1323+      }
1324       png_ptr->zstream.next_out = png_ptr->row_buf;
1325       return;
1326    }
1327@@ -558,8 +582,110 @@ png_push_read_IDAT(png_structrp png_ptr)
1328       }
1329
1330       png_ptr->idat_size = png_ptr->push_length;
1331+#ifdef PNG_MULTY_LINE_ENABLE
1332+      // init inflate_buff
1333+      if (png_ptr->inflate_buff_max_size < png_ptr->push_length)
1334+      {
1335+         png_free(png_ptr, png_ptr->inflate_buff);
1336+         png_ptr->inflate_buff = png_voidcast(png_bytep,
1337+            png_malloc(png_ptr, png_ptr->push_length));
1338+         png_ptr->inflate_buff_size = 0;
1339+      }
1340+      png_ptr->inflate_buff_max_size = png_ptr->push_length;
1341+#endif
1342    }
1343
1344+#ifdef PNG_MULTY_LINE_ENABLE
1345+   if (png_ptr->idat_size != 0 && png_ptr->save_buffer_size != 0)
1346+   {
1347+      if (png_ptr->idat_size <= png_ptr->save_buffer_size)
1348+      {
1349+         png_debug2(1, "png_IDAT1: idat_size=%d save_buffer_size=%ld",
1350+            png_ptr->idat_size, png_ptr->save_buffer_size);
1351+
1352+         size_t save_size = png_ptr->idat_size;
1353+
1354+         png_calculate_crc(png_ptr, png_ptr->save_buffer_ptr, save_size);
1355+         png_process_IDAT_data(png_ptr, png_ptr->save_buffer_ptr, save_size);
1356+
1357+         png_ptr->buffer_size -= save_size;
1358+         png_ptr->save_buffer_size -= save_size;
1359+         png_ptr->save_buffer_ptr += save_size;
1360+         png_ptr->idat_size = 0;
1361+      }
1362+
1363+      else
1364+      {
1365+         png_debug2(1, "png_IDAT2: idat_size=%d save_buffer_size=%ld",
1366+            png_ptr->idat_size, png_ptr->save_buffer_size);
1367+
1368+         size_t save_size = png_ptr->save_buffer_size;
1369+
1370+         memcpy(png_ptr->inflate_buff, png_ptr->save_buffer_ptr, save_size);
1371+
1372+         png_ptr->inflate_buff_size = save_size;
1373+         png_ptr->buffer_size -= save_size;
1374+         png_ptr->save_buffer_ptr += save_size;
1375+         png_ptr->save_buffer_size = 0;
1376+      }
1377+   }
1378+
1379+   if (png_ptr->idat_size != 0 && png_ptr->current_buffer_size != 0)
1380+   {
1381+      size_t save_size = png_ptr->current_buffer_size;
1382+      if (png_ptr->idat_size > png_ptr->inflate_buff_size + save_size)
1383+      {
1384+         png_debug2(1, "png_IDAT3: inflate_buff_size=%ld current_buffer_size=%ld",
1385+            png_ptr->inflate_buff_size, save_size);
1386+
1387+         memcpy(png_ptr->inflate_buff + png_ptr->inflate_buff_size,
1388+            png_ptr->current_buffer_ptr, save_size);
1389+
1390+         png_ptr->inflate_buff_size += save_size;
1391+         png_ptr->buffer_size -= save_size;
1392+         png_ptr->current_buffer_ptr += save_size;
1393+         png_ptr->current_buffer_size = 0;
1394+      }
1395+
1396+      else
1397+      {
1398+         if (png_ptr->inflate_buff_size == 0)
1399+         {
1400+            png_debug2(1, "png_IDAT4: inflate_buff_size=%ld current_buffer_size=%ld",
1401+               png_ptr->inflate_buff_size, save_size);
1402+
1403+            save_size = png_ptr->idat_size;
1404+
1405+            png_calculate_crc(png_ptr, png_ptr->current_buffer_ptr, save_size);
1406+            png_process_IDAT_data(png_ptr, png_ptr->current_buffer_ptr, save_size);
1407+
1408+            png_ptr->buffer_size -= save_size;
1409+            png_ptr->current_buffer_size -= save_size;
1410+            png_ptr->current_buffer_ptr += save_size;
1411+            png_ptr->idat_size = 0;
1412+         }
1413+
1414+         else
1415+         {
1416+            save_size = png_ptr->idat_size - png_ptr->inflate_buff_size;
1417+            png_debug2(1, "png_IDAT5: inflate_buff_size=%ld save_size=%ld",
1418+               png_ptr->inflate_buff_size, save_size);
1419+
1420+            memcpy(png_ptr->inflate_buff + png_ptr->inflate_buff_size,
1421+               png_ptr->current_buffer_ptr, save_size);
1422+
1423+            png_ptr->inflate_buff_size = 0;
1424+            png_calculate_crc(png_ptr, png_ptr->inflate_buff, png_ptr->idat_size);
1425+            png_process_IDAT_data(png_ptr, png_ptr->inflate_buff, png_ptr->idat_size);
1426+
1427+            png_ptr->buffer_size -= save_size;
1428+            png_ptr->current_buffer_size -= save_size;
1429+            png_ptr->current_buffer_ptr += save_size;
1430+            png_ptr->idat_size = 0;
1431+         }
1432+      }
1433+   }
1434+#else
1435    if (png_ptr->idat_size != 0 && png_ptr->save_buffer_size != 0)
1436    {
1437       size_t save_size = png_ptr->save_buffer_size;
1438@@ -612,6 +738,7 @@ png_push_read_IDAT(png_structrp png_ptr)
1439       png_ptr->current_buffer_size -= save_size;
1440       png_ptr->current_buffer_ptr += save_size;
1441    }
1442+#endif
1443
1444    if (png_ptr->idat_size == 0)
1445    {
1446@@ -623,6 +750,98 @@ png_push_read_IDAT(png_structrp png_ptr)
1447    }
1448 }
1449
1450+#ifdef PNG_MULTY_LINE_ENABLE
1451+void /* PRIVATE */
1452+png_push_process_row_x2(png_structrp png_ptr, png_row_info row_info_origin)
1453+{
1454+   /* Process two consecutive rows of row_buf that carry the same filter byte.
1455+    * "filter + 4" requests the two-row (x2) variant from png_read_filter_row;
1456+    * row_info_origin describes one untransformed row and serves both rows.
1457+    */
1458+   png_row_info row_info = row_info_origin;
1459+
1460+   png_debug(1, "in png_push_process_row_x2");
1461+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1462+      png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
1463+
1464+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1465+   if (png_ptr->transformations != 0)
1466+      png_do_read_transformations(png_ptr, &row_info);
1467+#endif
1468+
1469+   /* The transformed pixel depth should match the depth now in row_info. */
1470+   if (png_ptr->transformed_pixel_depth == 0)
1471+   {
1472+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1473+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1474+         png_error(png_ptr, "progressive row overflow");
1475+   }
1476+
1477+   png_push_have_row(png_ptr, png_ptr->row_buf + 1);
1478+   png_read_push_finish_row(png_ptr);
1479+
1480+   /* Second row: no separate filter call, the x2 call above is meant to cover it. */
1481+   png_ptr->row_buf = png_ptr->row_buf + png_ptr->rowbytes + 1;
1482+
1483+   /* The transformations above may have rewritten row_info; restart from the
1484+    * untransformed description so the copy size and the transforms are right.
1485+    */
1486+   row_info = row_info_origin;
1487+   if (png_ptr->transformations != 0)
1488+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1489+   else
1490+      png_ptr->prev_row = png_ptr->row_buf;
1491+
1492+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1493+   if (png_ptr->transformations != 0)
1494+      png_do_read_transformations(png_ptr, &row_info);
1495+#endif
1496+
1497+   png_push_have_row(png_ptr, png_ptr->row_buf + 1);
1498+   png_read_push_finish_row(png_ptr);
1499+}
1500+
1501+void png_push_process_multi_rows(png_structrp png_ptr, int row_num)
1502+{
1503+   /* Process row_num inflated rows laid out back to back in row_buf; adjacent
1504+    * rows with the same eligible filter byte are handled as a pair. */
1505+   png_bytep first_row = png_ptr->row_buf;
1506+   png_bytep saved_prev_row = png_ptr->prev_row;
1507+   uInt stride = png_ptr->rowbytes + 1;
1508+   png_row_info row_info;
1509+   int done = 0;
1510+
1511+   png_debug(1, "in png_push_process_multi_rows");
1512+   row_info.rowbytes = png_ptr->rowbytes;
1513+   row_info.pixel_depth = png_ptr->pixel_depth;
1514+   row_info.channels = png_ptr->channels;
1515+   row_info.bit_depth = png_ptr->bit_depth;
1516+   row_info.color_type = png_ptr->color_type;
1517+   row_info.width = png_ptr->iwidth;
1518+
1519+   while (done < row_num)
1520+   {
1521+      int paired = (png_ptr->channels == 3 || png_ptr->channels == 4) &&
1522+          done < row_num - 1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
1523+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1524+          png_ptr->row_buf[0] == png_ptr->row_buf[stride];
1525+      if (paired)
1526+         png_push_process_row_x2(png_ptr, row_info); /* steps row_buf once */
1527+      else
1528+         png_push_process_row(png_ptr);
1529+      png_ptr->row_buf += stride;
1530+      done += paired ? 2 : 1;
1531+   }
1532+
1533+   if (png_ptr->transformations == 0 && png_ptr->interlaced == 0)
1534+   {
1535+      png_ptr->prev_row = saved_prev_row; /* may point into row_buf */
1536+      memcpy(png_ptr->prev_row, png_ptr->row_buf - stride, stride);
1537+   }
1538+   png_ptr->row_buf = first_row;
1539+}
1540+#endif
1541+
1542 void /* PRIVATE */
1543 png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1544     size_t buffer_length)
1545@@ -639,6 +858,16 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1546    /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1547    png_ptr->zstream.avail_in = (uInt)buffer_length;
1548
1549+   int row_num = 1;
1550+#ifdef PNG_MULTY_LINE_ENABLE
1551+   if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1552+       (png_ptr->transformations & PNG_CHECK) == 0)
1553+   {
1554+      int rest = png_ptr->num_rows - png_ptr->row_number;
1555+      row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1556+   }
1557+#endif
1558+
1559    /* Keep going until the decompressed data is all processed
1560     * or the stream marked as finished.
1561     */
1562@@ -655,8 +884,16 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1563       if (!(png_ptr->zstream.avail_out > 0))
1564       {
1565          /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1566+#ifdef PNG_MULTY_LINE_ENABLE
1567+         if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1568+             (png_ptr->transformations & PNG_CHECK) == 0)
1569+         {
1570+            int rest = png_ptr->num_rows - png_ptr->row_number;
1571+            row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1572+         }
1573+#endif
1574          png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1575-             png_ptr->iwidth) + 1);
1576+             png_ptr->iwidth) + 1) * row_num;
1577
1578          png_ptr->zstream.next_out = png_ptr->row_buf;
1579       }
1580@@ -719,7 +956,11 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1581
1582          /* Do we have a complete row? */
1583          if (png_ptr->zstream.avail_out == 0)
1584+#ifdef PNG_MULTY_LINE_ENABLE
1585+            png_push_process_multi_rows(png_ptr, row_num);
1586+#else
1587             png_push_process_row(png_ptr);
1588+#endif
1589       }
1590
1591       /* And check for the end of the stream. */
1592@@ -738,6 +979,7 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1593 void /* PRIVATE */
1594 png_push_process_row(png_structrp png_ptr)
1595 {
1596+   png_debug(1, "in png_push_process_row");
1597    /* 1.5.6: row_info moved out of png_struct to a local here. */
1598    png_row_info row_info;
1599
1600@@ -762,8 +1004,16 @@ png_push_process_row(png_structrp png_ptr)
1601     * it may not be in the future, so this was changed just to copy the
1602     * interlaced row count:
1603     */
1604-   memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1605-
1606+#ifdef PNG_MULTY_LINE_ENABLE
1607+   if (png_ptr->transformations == 0 && png_ptr->interlaced == 0)
1608+   {
1609+      png_ptr->prev_row = png_ptr->row_buf;
1610+   }
1611+   else
1612+#endif
1613+   {
1614+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1615+   }
1616 #ifdef PNG_READ_TRANSFORMS_SUPPORTED
1617    if (png_ptr->transformations != 0)
1618       png_do_read_transformations(png_ptr, &row_info);
1619diff --git a/pngpriv.h b/pngpriv.h
1620index 2e426cf..6d60f70 100644
1621--- a/pngpriv.h
1622+++ b/pngpriv.h
1623@@ -127,7 +127,7 @@
1624     * associated assembler code, pass --enable-arm-neon=no to configure
1625     * or put -DPNG_ARM_NEON_OPT=0 in CPPFLAGS.
1626     */
1627-#  if defined(PNG_ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \
1628+#  if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \
1629    defined(PNG_ALIGNED_MEMORY_SUPPORTED)
1630 #     define PNG_ARM_NEON_OPT 2
1631 #  else
1632@@ -1304,6 +1315,18 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop
1633     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1634 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop
1635     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1636+#ifdef PNG_MULTY_LINE_ENABLE
1637+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_x2_neon,(png_row_infop
1638+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1639+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_x2_neon,(png_row_infop
1640+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1641+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_x2_neon,(png_row_infop
1642+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1643+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_x2_neon,(png_row_infop
1644+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1645+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_x2_neon,(png_row_infop
1646+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1647+#endif
1648 #endif
1649
1650 #if PNG_MIPS_MSA_OPT > 0
1651diff --git a/pngread.c b/pngread.c
1652index 5ab9224..6efc5b6 100644
1653--- a/pngread.c
1654+++ b/pngread.c
1655@@ -54,7 +54,11 @@ png_create_read_struct_2,(png_const_charp user_png_ver, png_voidp error_ptr,
1656        * required (it will be zero in a write structure.)
1657        */
1658 #     ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1659+#ifdef PNG_MULTY_LINE_ENABLE
1660+         png_ptr->IDAT_read_size = PNG_INFLATE_MAX_SIZE;
1661+#else
1662          png_ptr->IDAT_read_size = PNG_IDAT_READ_SIZE;
1663+#endif
1664 #     endif
1665
1666 #     ifdef PNG_BENIGN_READ_ERRORS_SUPPORTED
1667@@ -684,6 +688,184 @@ png_read_rows(png_structrp png_ptr, png_bytepp row,
1668 #endif /* SEQUENTIAL_READ */
1669
1670 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1671+
1672+#ifdef PNG_MULTY_LINE_ENABLE
1673+void png_read_two_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 i,
1674+                         png_row_info row_info)
1675+{
1676+   /* Deliver rows[i] and rows[i+1], stored back to back in row_buf.  The row
1677+    * transformations may rewrite row_info, so an untouched copy is kept for
1678+    * the buffer stride and for processing the second row.
1679+    */
1680+   const png_row_info row_info_origin = row_info;
1681+
1682+   png_debug1(1, "in png_read_two_rows %d", png_ptr->row_buf[0]);
1683+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1684+      png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
1685+
1686+#ifdef PNG_MNG_FEATURES_SUPPORTED
1687+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1688+      (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1689+   {
1690+      /* Intrapixel differencing */
1691+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1692+   }
1693+#endif
1694+
1695+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1696+   if (png_ptr->transformations
1697+#       ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1698+         || png_ptr->num_palette_max >= 0
1699+#       endif
1700+      )
1701+      png_do_read_transformations(png_ptr, &row_info);
1702+#endif
1703+
1704+   /* The transformed pixel depth should match the depth now in row_info. */
1705+   if (png_ptr->transformed_pixel_depth == 0)
1706+   {
1707+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1708+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1709+         png_error(png_ptr, "sequential row overflow");
1710+   }
1711+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1712+      png_error(png_ptr, "internal sequential row size calculation error");
1713+
1714+   if (rows[i] != NULL)
1715+      png_combine_row(png_ptr, rows[i], -1/*ignored*/);
1716+
1717+   png_read_finish_row(png_ptr);
1718+   if (png_ptr->read_row_fn != NULL)
1719+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1720+
1721+   /* Second row: step by the untransformed stride and describe the row with
1722+    * the untransformed row_info again before copying and transforming it. */
1723+   png_ptr->row_buf = png_ptr->row_buf + row_info_origin.rowbytes + 1;
1724+   row_info = row_info_origin;
1725+   memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1726+
1727+#ifdef PNG_MNG_FEATURES_SUPPORTED
1728+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1729+      (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1730+   {
1731+      /* Intrapixel differencing */
1732+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1733+   }
1734+#endif
1735+
1736+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1737+   if (png_ptr->transformations
1738+#       ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1739+         || png_ptr->num_palette_max >= 0
1740+#       endif
1741+      )
1742+      png_do_read_transformations(png_ptr, &row_info);
1743+#endif
1744+
1745+   /* The transformed pixel depth should match the depth now in row_info. */
1746+   if (png_ptr->transformed_pixel_depth == 0)
1747+   {
1748+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1749+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1750+         png_error(png_ptr, "sequential row overflow");
1751+   }
1752+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1753+      png_error(png_ptr, "internal sequential row size calculation error");
1754+
1755+   if (rows[i+1] != NULL)
1756+      png_combine_row(png_ptr, rows[i+1], -1/*ignored*/);
1757+
1758+   png_read_finish_row(png_ptr);
1759+   if (png_ptr->read_row_fn != NULL)
1760+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1761+
1762+   png_ptr->row_buf = png_ptr->row_buf + row_info_origin.rowbytes + 1;
1763+}
1764+
1765+/* Inflate row_num rows with a single png_read_IDAT_data call, then unfilter
1766+ * and deliver them to rows[0..row_num-1]; same-filter 3/4-channel row pairs
1767+ * go through png_read_two_rows.  row_info_origin describes one raw row.
1768+ */
1769+void png_read_muilty_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 row_num,
1770+                         png_row_info row_info_origin)
1771+{
1772+   if (png_ptr == NULL)
1773+      return;
1774+
1775+   png_debug2(1, "in png_read_muilty_rows (row %lu, pass %d)",
1776+       (unsigned long)png_ptr->row_number, png_ptr->pass);
1777+
1778+   if ((png_ptr->mode & PNG_HAVE_IDAT) == 0)
1779+         png_error(png_ptr, "Invalid attempt to read row data");
1780+
1781+   /* Fill the buffer with IDAT data; size_t keeps the byte count from
1782+    * being truncated to unsigned int. */
1783+   size_t row_bytes =  row_info_origin.rowbytes;
1784+   png_ptr->row_buf[0]=255; /* to force error if no data was found */
1785+   png_read_IDAT_data(png_ptr, png_ptr->row_buf, (row_bytes + 1) * row_num);
1786+   png_bytep temp_row = png_ptr->row_buf;
1787+
1788+   for (png_uint_32 i = 0; i < row_num; i++) {
1789+      png_row_info row_info = row_info_origin;
1790+      if ((row_info_origin.channels == 3 || row_info_origin.channels == 4) &&
1791+          i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
1792+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1793+          png_ptr->row_buf[0] == png_ptr->row_buf[row_info_origin.rowbytes + 1]
1794+         ) {
1795+         png_read_two_rows(png_ptr, rows, i, row_info);
1796+         i++;
1797+         continue;
1798+      }
1799+      if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE)
1800+      {
1801+         if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST)
1802+            png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1803+               png_ptr->prev_row + 1, png_ptr->row_buf[0]);
1804+         else
1805+            png_error(png_ptr, "bad adaptive filter value");
1806+      }
1807+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_origin.rowbytes + 1);
1808+
1809+#ifdef PNG_MNG_FEATURES_SUPPORTED
1810+      if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1811+         (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1812+      {
1813+         /* Intrapixel differencing */
1814+         png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1815+      }
1816+#endif
1817+
1818+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1819+      if (png_ptr->transformations
1820+#        ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1821+            || png_ptr->num_palette_max >= 0
1822+#        endif
1823+         )
1824+         png_do_read_transformations(png_ptr, &row_info);
1825+#endif
1826+
1827+      /* The transformed pixel depth should match the depth now in row_info. */
1828+      if (png_ptr->transformed_pixel_depth == 0)
1829+      {
1830+         png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1831+         if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1832+            png_error(png_ptr, "sequential row overflow");
1833+      }
1834+
1835+      else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1836+         png_error(png_ptr, "internal sequential row size calculation error");
1837+
1838+      if (rows[i] != NULL)
1839+         png_combine_row(png_ptr, rows[i], -1/*ignored*/);
1840+      png_read_finish_row(png_ptr);
1841+      if (png_ptr->read_row_fn != NULL)
1842+         (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1843+      png_ptr->row_buf = png_ptr->row_buf + row_bytes + 1;
1844+   }
1845+   png_ptr->row_buf = temp_row;
1846+}
1847+#endif
1848+
1849 /* Read the entire image.  If the image has an alpha channel or a tRNS
1850  * chunk, and you have called png_handle_alpha()[*], you will need to
1851  * initialize the image to the current image that PNG will be overlaying.
1852@@ -745,13 +927,82 @@ png_read_image(png_structrp png_ptr, png_bytepp image)
1853
1854    image_height=png_ptr->height;
1855
1856-   for (j = 0; j < pass; j++)
1857-   {
1858+#ifdef PNG_MULTY_LINE_ENABLE
1859+   if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1860+       (png_ptr->transformations & PNG_CHECK) == 0) {
1861+      if ((png_ptr->flags & PNG_FLAG_ROW_INIT) == 0)
1862+         png_read_start_row(png_ptr);
1863+
1864+#ifdef PNG_WARNINGS_SUPPORTED
1865+      /* Check for transforms that have been set but were defined out */
1866+#if defined(PNG_WRITE_INVERT_SUPPORTED) && !defined(PNG_READ_INVERT_SUPPORTED)
1867+      if ((png_ptr->transformations & PNG_INVERT_MONO) != 0)
1868+         png_warning(png_ptr, "PNG_READ_INVERT_SUPPORTED is not defined");
1869+#endif
1870+
1871+#if defined(PNG_WRITE_FILLER_SUPPORTED) && !defined(PNG_READ_FILLER_SUPPORTED)
1872+      if ((png_ptr->transformations & PNG_FILLER) != 0)
1873+         png_warning(png_ptr, "PNG_READ_FILLER_SUPPORTED is not defined");
1874+#endif
1875+
1876+#if defined(PNG_WRITE_PACKSWAP_SUPPORTED) && \
1877+    !defined(PNG_READ_PACKSWAP_SUPPORTED)
1878+      if ((png_ptr->transformations & PNG_PACKSWAP) != 0)
1879+         png_warning(png_ptr, "PNG_READ_PACKSWAP_SUPPORTED is not defined");
1880+#endif
1881+
1882+#if defined(PNG_WRITE_PACK_SUPPORTED) && !defined(PNG_READ_PACK_SUPPORTED)
1883+      if ((png_ptr->transformations & PNG_PACK) != 0)
1884+         png_warning(png_ptr, "PNG_READ_PACK_SUPPORTED is not defined");
1885+#endif
1886+
1887+#if defined(PNG_WRITE_SHIFT_SUPPORTED) && !defined(PNG_READ_SHIFT_SUPPORTED)
1888+      if ((png_ptr->transformations & PNG_SHIFT) != 0)
1889+         png_warning(png_ptr, "PNG_READ_SHIFT_SUPPORTED is not defined");
1890+#endif
1891+
1892+#if defined(PNG_WRITE_BGR_SUPPORTED) && !defined(PNG_READ_BGR_SUPPORTED)
1893+      if ((png_ptr->transformations & PNG_BGR) != 0)
1894+         png_warning(png_ptr, "PNG_READ_BGR_SUPPORTED is not defined");
1895+#endif
1896+
1897+#if defined(PNG_WRITE_SWAP_SUPPORTED) && !defined(PNG_READ_SWAP_SUPPORTED)
1898+      if ((png_ptr->transformations & PNG_SWAP_BYTES) != 0)
1899+         png_warning(png_ptr, "PNG_READ_SWAP_SUPPORTED is not defined");
1900+#endif
1901+#endif /* WARNINGS */
1902+
1903+      png_row_info row_info;
1904+      row_info.width = png_ptr->iwidth;
1905+      row_info.color_type = png_ptr->color_type;
1906+      row_info.bit_depth = png_ptr->bit_depth;
1907+      row_info.channels = png_ptr->channels;
1908+      row_info.pixel_depth = png_ptr->pixel_depth;
1909+      row_info.rowbytes = png_ptr->rowbytes;
1910+
1911       rp = image;
1912-      for (i = 0; i < image_height; i++)
1913+      int row_num = PNG_INFLATE_ROWS;
1914+      for (i = 0; i < image_height; i += PNG_INFLATE_ROWS)
1915       {
1916-         png_read_row(png_ptr, *rp, NULL);
1917-         rp++;
1918+         if (image_height - i < PNG_INFLATE_ROWS)
1919+         {
1920+            row_num = image_height - i;
1921+         }
1922+         png_read_muilty_rows(png_ptr, rp, row_num, row_info);
1923+         rp += row_num;
1924+      }
1925+   }
1926+   else
1927+#endif
1928+   {
1929+      for (j = 0; j < pass; j++)
1930+      {
1931+         rp = image;
1932+         for (i = 0; i < image_height; i++)
1933+         {
1934+            png_read_row(png_ptr, *rp, NULL);
1935+            rp++;
1936+         }
1937       }
1938    }
1939 }
1940@@ -1000,6 +1251,10 @@ png_read_destroy(png_structrp png_ptr)
1941    png_ptr->riffled_palette = NULL;
1942 #endif
1943
1944+#ifdef PNG_MULTY_LINE_ENABLE
1945+   png_free(png_ptr, png_ptr->inflate_buff);
1946+   png_ptr->inflate_buff = NULL;
1947+#endif
1948    /* NOTE: the 'setjmp' buffer may still be allocated and the memory and error
1949     * callbacks are still set at this point.  They are required to complete the
1950     * destruction of the png_struct itself.
1951diff --git a/pngrutil.c b/pngrutil.c
1952index ca060dd..c3c177c 100644
1953--- a/pngrutil.c
1954+++ b/pngrutil.c
1955@@ -4136,7 +4136,7 @@ png_read_filter_row(png_structrp pp, png_row_infop row_info, png_bytep row,
1956     * PNG_FILTER_OPTIMIZATIONS to a function that overrides the generic
1957     * implementations.  See png_init_filter_functions above.
1958     */
1959-   if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST)
1960+   if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST_X2)
1961    {
1962       if (pp->read_filter[0] == NULL)
1963          png_init_filter_functions(pp);
1964@@ -4604,11 +4604,24 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED)
1965       png_free(png_ptr, png_ptr->big_prev_row);
1966
1967       if (png_ptr->interlaced != 0)
1968-         png_ptr->big_row_buf = (png_bytep)png_calloc(png_ptr,
1969-             row_bytes + 48);
1970+      {
1971+         png_ptr->big_row_buf = (png_bytep)png_calloc(png_ptr, row_bytes + 48);
1972+      }
1973
1974       else
1975-         png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
1976+      {
1977+         png_uint_32 row_num = 1;
1978+#ifdef PNG_MULTY_LINE_ENABLE
1979+         if (png_ptr->bit_depth == 8 &&
1980+             (png_ptr->transformations & PNG_CHECK) == 0)
1981+         {
1982+            row_num = png_ptr->height < PNG_INFLATE_ROWS ?
1983+               png_ptr->height : PNG_INFLATE_ROWS;
1984+         }
1985+#endif
1986+         png_ptr->big_row_buf =
1987+            (png_bytep)png_malloc(png_ptr, row_bytes * row_num + 48);
1988+      }
1989
1990       png_ptr->big_prev_row = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
1991
1992diff --git a/pngstruct.h b/pngstruct.h
1993index e591d94..1875c7a 100644
1994--- a/pngstruct.h
1995+++ b/pngstruct.h
1996@@ -140,6 +140,16 @@ typedef const png_colorspace * PNG_RESTRICT png_const_colorspacerp;
1997 #define PNG_COLORSPACE_CANCEL(flags)        (0xffff ^ (flags))
1998 #endif /* COLORSPACE || GAMMA */
1999
2000+#ifdef PNG_MULTY_LINE_ENABLE
2001+/* Filter codes for the two-row ("_X2") filters: the one-row code plus 4 */
2002+#define PNG_FILTER_VALUE_UP_X2      6 /* PNG_FILTER_VALUE_UP + 4 */
2003+#define PNG_FILTER_VALUE_AVG_X2     7 /* PNG_FILTER_VALUE_AVG + 4 */
2004+#define PNG_FILTER_VALUE_PAETH_X2   8 /* PNG_FILTER_VALUE_PAETH + 4 */
2005+#define PNG_FILTER_VALUE_LAST_X2    9 /* PNG_FILTER_VALUE_LAST + 4 */
2006+#else
2007+#define PNG_FILTER_VALUE_LAST_X2    5 /* same as PNG_FILTER_VALUE_LAST */
2008+#endif
2009+
2010 struct png_struct_def
2011 {
2012 #ifdef PNG_SETJMP_SUPPORTED
2013@@ -467,7 +477,7 @@ struct png_struct_def
2014    png_bytep big_prev_row;
2015
2016 /* New member added in libpng-1.5.7 */
2017-   void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info,
2018+   void (*read_filter[PNG_FILTER_VALUE_LAST_X2-1])(png_row_infop row_info,
2019       png_bytep row, png_const_bytep prev_row);
2020
2021 #ifdef PNG_READ_SUPPORTED
2022@@ -475,5 +485,11 @@ struct png_struct_def
2023    png_colorspace   colorspace;
2024 #endif
2025 #endif
2026+
2027+#ifdef PNG_MULTY_LINE_ENABLE
2028+   png_bytep inflate_buff;
2029+   png_uint_32 inflate_buff_max_size;
2030+   png_uint_32 inflate_buff_size;
2031+#endif
2032 };
2033 #endif /* PNGSTRUCT_H */
2034diff --git a/pngtrans.c b/pngtrans.c
2035index 1100f46..4860e20 100644
2036--- a/pngtrans.c
2037+++ b/pngtrans.c
2038@@ -13,6 +13,17 @@
2039
2040 #include "pngpriv.h"
2041
2042+#ifdef PNG_ARM_NEON_IMPLEMENTATION
2043+#  if PNG_ARM_NEON_IMPLEMENTATION == 1
2044+#    define PNG_ARM_NEON_INTRINSICS_AVAILABLE
2045+#    if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
2046+#      include <arm64_neon.h>
2047+#    else
2048+#      include <arm_neon.h>
2049+#    endif
2050+#  endif
2051+#endif
2052+
2053 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
2054
2055 #if defined(PNG_READ_BGR_SUPPORTED) || defined(PNG_WRITE_BGR_SUPPORTED)
2056@@ -269,13 +280,19 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2057    if (row_info->color_type == PNG_COLOR_TYPE_GRAY)
2058    {
2059       png_bytep rp = row;
2060-      size_t i;
2061-      size_t istop = row_info->rowbytes;
2062-
2063-      for (i = 0; i < istop; i++)
2064+      png_bytep rp_stop = row + row_info->rowbytes;
2065+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2066+      png_bytep rp_stop_neon = row + row_info->rowbytes - 16;
2067+      for (; rp < rp_stop_neon; rp += 16)
2068+      {
2069+         uint8x16_t gray = vld1q_u8(rp);
2070+         gray = ~gray;
2071+         vst1q_u8(rp, gray);
2072+      }
2073+#endif
2074+      for (; rp < rp_stop; rp++)
2075       {
2076          *rp = (png_byte)(~(*rp));
2077-         rp++;
2078       }
2079    }
2080
2081@@ -283,10 +300,17 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2082       row_info->bit_depth == 8)
2083    {
2084       png_bytep rp = row;
2085-      size_t i;
2086-      size_t istop = row_info->rowbytes;
2087-
2088-      for (i = 0; i < istop; i += 2)
2089+      png_bytep rp_stop = row + row_info->rowbytes;
2090+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2091+      png_bytep rp_stop_neon = row + row_info->rowbytes - 32;
2092+      for (; rp < rp_stop_neon; rp += 32)
2093+      {
2094+         uint8x16x2_t gray_alpha = vld2q_u8(rp);
2095+         gray_alpha.val[0] = ~gray_alpha.val[0];
2096+         vst2q_u8(rp, gray_alpha);
2097+      }
2098+#endif
2099+      for (; rp < rp_stop; ) /* the loop body advances rp by 2 */
2100       {
2101          *rp = (png_byte)(~(*rp));
2102          rp += 2;
2103@@ -298,10 +322,18 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2104       row_info->bit_depth == 16)
2105    {
2106       png_bytep rp = row;
2107-      size_t i;
2108-      size_t istop = row_info->rowbytes;
2109-
2110-      for (i = 0; i < istop; i += 4)
2111+      png_bytep rp_stop = row + row_info->rowbytes;
2112+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2113+      png_bytep rp_stop_neon = row + row_info->rowbytes - 64;
2114+      for (; rp < rp_stop_neon; rp += 64)
2115+      {
2116+         uint8x16x4_t gray_alpha = vld4q_u8(rp);
2117+         gray_alpha.val[0] = ~gray_alpha.val[0];
2118+         gray_alpha.val[1] = ~gray_alpha.val[1];
2119+         vst4q_u8(rp, gray_alpha);
2120+      }
2121+#endif
2122+      for (; rp < rp_stop; ) /* NOTE(review): body's own rp += 4 advances */
2123       {
2124          *rp = (png_byte)(~(*rp));
2125          *(rp + 1) = (png_byte)(~(*(rp + 1)));
2126@@ -323,10 +355,20 @@ png_do_swap(png_row_infop row_info, png_bytep row)
2127    if (row_info->bit_depth == 16)
2128    {
2129       png_bytep rp = row;
2130-      png_uint_32 i;
2131-      png_uint_32 istop= row_info->width * row_info->channels;
2132+      png_bytep rp_stop = row + row_info->rowbytes;
2133+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2134+      png_bytep rp_stop_neon = row + row_info->rowbytes - 32;
2135+      for (; rp < rp_stop_neon; rp += 32)
2136+      {
2137+         uint8x16x2_t gray = vld2q_u8(rp);
2138+         uint8x16_t tmp = gray.val[0];
2139+         gray.val[0] = gray.val[1];
2140+         gray.val[1] = tmp;
2141+         vst2q_u8(rp, gray);
2142+      }
2143+#endif
2144
2145-      for (i = 0; i < istop; i++, rp += 2)
2146+      for (; rp < rp_stop; rp += 2)
2147       {
2148 #ifdef PNG_BUILTIN_BSWAP16_SUPPORTED
2149          /* Feature added to libpng-1.6.11 for testing purposes, not
2150@@ -622,15 +664,25 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2151
2152    if ((row_info->color_type & PNG_COLOR_MASK_COLOR) != 0)
2153    {
2154-      png_uint_32 row_width = row_info->width;
2155       if (row_info->bit_depth == 8)
2156       {
2157          if (row_info->color_type == PNG_COLOR_TYPE_RGB)
2158          {
2159-            png_bytep rp;
2160-            png_uint_32 i;
2161+            png_bytep rp = row;
2162+            png_bytep rp_stop = row + row_info->rowbytes;
2163
2164-            for (i = 0, rp = row; i < row_width; i++, rp += 3)
2165+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2166+            png_bytep rp_stop_neon = row + row_info->rowbytes - 48;
2167+            for (; rp < rp_stop_neon; rp += 48)
2168+            {
2169+               uint8x16x3_t bgr = vld3q_u8(rp);
2170+               uint8x16_t tmp = bgr.val[2];
2171+               bgr.val[2] = bgr.val[0];
2172+               bgr.val[0] = tmp;
2173+               vst3q_u8(rp, bgr);
2174+            }
2175+#endif
2176+            for (; rp < rp_stop; rp += 3)
2177             {
2178                png_byte save = *rp;
2179                *rp = *(rp + 2);
2180@@ -640,10 +692,21 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2181
2182          else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
2183          {
2184-            png_bytep rp;
2185-            png_uint_32 i;
2186+            png_bytep rp = row;
2187+            png_bytep rp_stop = row + row_info->rowbytes;
2188
2189-            for (i = 0, rp = row; i < row_width; i++, rp += 4)
2190+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2191+            png_bytep rp_stop_neon = row + row_info->rowbytes - 64;
2192+            for (; rp < rp_stop_neon; rp += 64)
2193+            {
2194+               uint8x16x4_t bgra = vld4q_u8(rp);
2195+               uint8x16_t tmp = bgra.val[2];
2196+               bgra.val[2] = bgra.val[0];
2197+               bgra.val[0] = tmp;
2198+               vst4q_u8(rp, bgra);
2199+            }
2200+#endif
2201+            for (; rp < rp_stop; rp += 4)
2202             {
2203                png_byte save = *rp;
2204                *rp = *(rp + 2);
2205@@ -657,10 +720,21 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2206       {
2207          if (row_info->color_type == PNG_COLOR_TYPE_RGB)
2208          {
2209-            png_bytep rp;
2210-            png_uint_32 i;
2211+            png_bytep rp = row;
2212+            png_bytep rp_stop = row + row_info->rowbytes;
2213
2214-            for (i = 0, rp = row; i < row_width; i++, rp += 6)
2215+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2216+            png_bytep rp_stop_neon = row + row_info->rowbytes - 48;
2217+            for (; rp < rp_stop_neon; rp += 48)
2218+            {
2219+               uint16x8x3_t bgr = vld3q_u16((unsigned short *)rp);
2220+               uint16x8_t tmp = bgr.val[2];
2221+               bgr.val[2] = bgr.val[0];
2222+               bgr.val[0] = tmp;
2223+               vst3q_u16((unsigned short *)rp, bgr);
2224+            }
2225+#endif
2226+            for (; rp < rp_stop; rp += 6)
2227             {
2228                png_byte save = *rp;
2229                *rp = *(rp + 4);
2230@@ -673,10 +747,21 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2231
2232          else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
2233          {
2234-            png_bytep rp;
2235-            png_uint_32 i;
2236+            png_bytep rp = row;
2237+            png_bytep rp_stop = row + row_info->rowbytes;
2238
2239-            for (i = 0, rp = row; i < row_width; i++, rp += 8)
2240+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE
2241+            /* vld4q_u16/vst4q_u16 move 64 bytes: need 64 left in the row */
2242+            for (; rp_stop - rp >= 64; rp += 64)
2243+            {
2244+               uint16x8x4_t bgra = vld4q_u16((unsigned short *)rp);
2245+               uint16x8_t tmp = bgra.val[2];
2246+               bgra.val[2] = bgra.val[0];
2247+               bgra.val[0] = tmp;
2248+               vst4q_u16((unsigned short *)rp, bgra);
2249+            }
2250+#endif
2251+            for (; rp < rp_stop; rp += 8)
2252             {
2253                png_byte save = *rp;
2254                *rp = *(rp + 4);
2255