1diff --git a/arm/arm_init.c b/arm/arm_init.c 2index ab22525..af40b2b 100644 3--- a/arm/arm_init.c 4+++ b/arm/arm_init.c 5@@ -115,13 +115,21 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp) 6 * initialization function.) 7 */ 8 pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon; 9- 10+#ifdef PNG_MULTY_LINE_ENABLE 11+ pp->read_filter[PNG_FILTER_VALUE_UP_X2-1] = png_read_filter_row_up_x2_neon; 12+#endif 13 if (bpp == 3) 14 { 15 pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon; 16 pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon; 17 pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = 18 png_read_filter_row_paeth3_neon; 19+#ifdef PNG_MULTY_LINE_ENABLE 20+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] = 21+ png_read_filter_row_avg3_x2_neon; 22+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] = 23+ png_read_filter_row_paeth3_x2_neon; 24+#endif 25 } 26 27 else if (bpp == 4) 28@@ -130,6 +130,12 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp) 29 pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon; 30 pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = 31 png_read_filter_row_paeth4_neon; 32+#ifdef PNG_MULTY_LINE_ENABLE 33+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] = 34+ png_read_filter_row_avg4_x2_neon; 35+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] = 36+ png_read_filter_row_paeth4_x2_neon; 37+#endif 38 } 39 } 40 #endif /* PNG_ARM_NEON_OPT > 0 */ 41diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c 42index 4466d48..f11286a 100644 43--- a/arm/filter_neon_intrinsics.c 44+++ b/arm/filter_neon_intrinsics.c 45@@ -52,21 +52,90 @@ png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, 46 png_const_bytep prev_row) 47 { 48 png_bytep rp = row; 49- png_bytep rp_stop = row + row_info->rowbytes; 50 png_const_bytep pp = prev_row; 51+ int count = row_info->rowbytes; 52 53 png_debug(1, "in png_read_filter_row_up_neon"); 54 55- for (; rp < rp_stop; rp += 
16, pp += 16) 56- { 57- uint8x16_t qrp, qpp; 58+ uint8x16_t qrp, qpp; 59+ while (count >= 16) { 60+ qrp = vld1q_u8(rp); 61+ qpp = vld1q_u8(pp); 62+ qrp = vaddq_u8(qrp, qpp); 63+ vst1q_u8(rp, qrp); 64+ rp += 16; 65+ pp += 16; 66+ count -= 16; 67+ } 68+ 69+ if (count >= 8) { 70+ uint8x8_t qrp1, qpp1; 71+ qrp1 = vld1_u8(rp); 72+ qpp1 = vld1_u8(pp); 73+ qrp1 = vadd_u8(qrp1, qpp1); 74+ vst1_u8(rp, qrp1); 75+ rp += 8; 76+ pp += 8; 77+ count -= 8; 78+ } 79+ 80+ int i = 0; 81+ for (i = 0; i < count; i++) { 82+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 83+ rp++; 84+ } 85+} 86+ 87+#ifdef PNG_MULTY_LINE_ENABLE 88+void 89+png_read_filter_row_up_x2_neon(png_row_infop row_info, png_bytep row, 90+ png_const_bytep prev_row) 91+{ 92+ png_bytep rp = row; 93+ png_const_bytep pp = prev_row; 94+ int count = row_info->rowbytes; 95+ png_bytep np = row + row_info->rowbytes + 1; 96+ 97+ png_debug(1, "in png_read_filter_row_up_x2_neon"); 98 99+ uint8x16_t qrp, qpp, qnp; 100+ while (count >= 16) { 101 qrp = vld1q_u8(rp); 102 qpp = vld1q_u8(pp); 103+ qnp = vld1q_u8(np); 104 qrp = vaddq_u8(qrp, qpp); 105+ qnp = vaddq_u8(qnp, qrp); 106 vst1q_u8(rp, qrp); 107+ vst1q_u8(np, qnp); 108+ rp += 16; 109+ pp += 16; 110+ np += 16; 111+ count -= 16; 112+ } 113+ 114+ if (count >= 8) { 115+ uint8x8_t qrp1, qpp1, qnp1; 116+ qrp1 = vld1_u8(rp); 117+ qpp1 = vld1_u8(pp); 118+ qnp1 = vld1_u8(np); 119+ qrp1 = vadd_u8(qrp1, qpp1); 120+ qnp1 = vadd_u8(qnp1, qrp1); 121+ vst1_u8(rp, qrp1); 122+ vst1_u8(np, qnp1); 123+ rp += 8; 124+ pp += 8; 125+ np += 8; 126+ count -= 8; 127+ } 128+ 129+ int i = 0; 130+ for (i = 0; i < count; i++) { 131+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 132+ *np = (png_byte)(((int)(*np) + (int)(*rp++)) & 0xff); 133+ np++; 134 } 135 } 136+#endif 137 138 void 139 png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, 140@@ -82,13 +151,16 @@ png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, 141 uint8x8x4_t vdest; 142 vdest.val[3] = 
vdup_n_u8(0); 143 144+ uint8x8_t vtmp1, vtmp2; 145+ uint32x2_t *temp_pointer; 146+ 147 png_debug(1, "in png_read_filter_row_sub3_neon"); 148 149- for (; rp < rp_stop;) 150+ size_t tail_bytes = row_info->rowbytes % 12; 151+ png_byte last_byte = *rp_stop; 152+ png_bytep rp_stop_new = rp_stop - tail_bytes; 153+ for (; rp < rp_stop_new;) 154 { 155- uint8x8_t vtmp1, vtmp2; 156- uint32x2_t *temp_pointer; 157- 158 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 159 vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); 160 vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6); 161@@ -112,6 +184,32 @@ png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, 162 rp += 3; 163 } 164 165+ if (tail_bytes == 3) { 166+ vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); 167+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 168+ } else if (tail_bytes == 6) { 169+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 170+ vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); 171+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 172+ 173+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 174+ rp += 3; 175+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 176+ } else if (tail_bytes == 9) { 177+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 178+ vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); 179+ vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6); 180+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 181+ vdest.val[2] = vadd_u8(vdest.val[1], vtmp2); 182+ 183+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 184+ rp += 3; 185+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 186+ rp += 3; 187+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 188+ } 189+ *rp_stop = last_byte; 190+ 191 PNG_UNUSED(prev_row) 192 } 193 194@@ -120,20 +218,22 @@ png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row, 195 png_const_bytep prev_row) 196 { 197 png_bytep rp = row; 198- 
png_bytep rp_stop = row + row_info->rowbytes; 199+ int count = row_info->rowbytes; 200 201 uint8x8x4_t vdest; 202 vdest.val[3] = vdup_n_u8(0); 203 204 png_debug(1, "in png_read_filter_row_sub4_neon"); 205 206- for (; rp < rp_stop; rp += 16) 207- { 208- uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp)); 209- uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp); 210- uint8x8x4_t vrp = *vrpt; 211+ uint32x2x4_t vtmp; 212+ uint8x8x4_t *vrpt; 213+ uint8x8x4_t vrp; 214+ uint32x2x4_t vdest_val; 215+ while (count >= 16) { 216 uint32x2x4_t *temp_pointer; 217- uint32x2x4_t vdest_val; 218+ vtmp = vld4_u32(png_ptr(uint32_t,rp)); 219+ vrpt = png_ptr(uint8x8x4_t,&vtmp); 220+ vrp = *vrpt; 221 222 vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); 223 vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]); 224@@ -142,8 +242,42 @@ png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row, 225 226 vdest_val = png_ldr(uint32x2x4_t, &vdest); 227 vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0); 228+ 229+ rp += 16; 230+ count -= 16; 231 } 232 233+ if (count >= 8) { 234+ uint32x2x2_t vtmp1 = vld2_u32(png_ptr(uint32_t,rp)); 235+ uint8x8x2_t *vrpt1 = png_ptr(uint8x8x2_t,&vtmp1); 236+ uint8x8x2_t vrp1 = *vrpt1; 237+ uint32x2x2_t *temp_pointer; 238+ uint32x2x2_t vdest_val1; 239+ 240+ vdest.val[0] = vadd_u8(vdest.val[3], vrp1.val[0]); 241+ vdest.val[1] = vadd_u8(vdest.val[0], vrp1.val[1]); 242+ 243+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 244+ vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0); 245+ 246+ rp += 8; 247+ count -= 8; 248+ } 249+ 250+ if (count == 0) { 251+ return; 252+ } 253+ 254+ uint32x2_t vtmp2 = vld1_u32(png_ptr(uint32_t,rp)); 255+ uint8x8_t *vrpt2 = png_ptr(uint8x8_t,&vtmp2); 256+ uint8x8_t vrp2 = *vrpt2; 257+ uint32x2_t *temp_pointer; 258+ uint32x2_t vdest_val2; 259+ 260+ vdest.val[0] = vadd_u8(vdest.val[1], vrp2); 261+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 262+ vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0); 263+ 264 PNG_UNUSED(prev_row) 265 } 266 267@@ -167,15 
+301,140 @@ png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row, 268 269 png_debug(1, "in png_read_filter_row_avg3_neon"); 270 271- for (; rp < rp_stop; pp += 12) 272+ uint8x8_t vtmp1, vtmp2, vtmp3; 273+ uint8x8x2_t *vppt; 274+ uint8x8x2_t vpp; 275+ uint32x2_t *temp_pointer; 276+ 277+ size_t tail_bytes = row_info->rowbytes % 12; 278+ png_byte last_byte = *rp_stop; 279+ png_bytep rp_stop_new = rp_stop - tail_bytes; 280+ for (; rp < rp_stop_new; pp += 12) 281 { 282- uint8x8_t vtmp1, vtmp2, vtmp3; 283+ vtmp = vld1q_u8(pp); 284+ vppt = png_ptr(uint8x8x2_t,&vtmp); 285+ vpp = *vppt; 286+ 287+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 288+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 289+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 290+ 291+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 292+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6); 293+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 294+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 295+ 296+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6); 297+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 298+ 299+ vtmp = vld1q_u8(rp + 12); 300+ vrpt = png_ptr(uint8x8x2_t,&vtmp); 301+ vrp = *vrpt; 302+ 303+ vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2); 304+ vdest.val[2] = vadd_u8(vdest.val[2], vtmp3); 305+ 306+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 307+ 308+ vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2); 309+ vdest.val[3] = vadd_u8(vdest.val[3], vtmp1); 310+ 311+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 312+ rp += 3; 313+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 314+ rp += 3; 315+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 316+ rp += 3; 317+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); 318+ rp += 3; 319+ } 320+ 321+ vtmp = vld1q_u8(pp); 322+ vppt = png_ptr(uint8x8x2_t,&vtmp); 323+ vpp = *vppt; 324+ 325+ if (tail_bytes == 3) { 326+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 
327+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 328+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 329+ } else if (tail_bytes == 6) { 330+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 331+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 332+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 333+ 334+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 335+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 336+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 337+ 338+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 339+ rp += 3; 340+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 341+ } else if (tail_bytes == 9) { 342+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 343+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 344+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 345+ 346+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 347+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6); 348+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 349+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 350+ 351+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6); 352+ 353+ vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2); 354+ vdest.val[2] = vadd_u8(vdest.val[2], vtmp3); 355+ 356+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 357+ rp += 3; 358+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 359+ rp += 3; 360+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 361+ } 362+ *rp_stop = last_byte; 363+} 364+ 365+#ifdef PNG_MULTY_LINE_ENABLE 366+void 367+png_read_filter_row_avg3_x2_neon(png_row_infop row_info, png_bytep row, 368+ png_const_bytep prev_row) 369+{ 370+ png_bytep rp = row; 371+ png_const_bytep pp = prev_row; 372+ png_bytep rp_stop = row + row_info->rowbytes; 373+ png_bytep np = rp_stop + 1; 374+ 375+ uint8x16_t vtmp; 376+ uint8x8x2_t *vrpt; 377+ uint8x8x2_t vrp; 378+ uint8x8x4_t vdest; 379+ vdest.val[3] = vdup_n_u8(0); 380+ 381+ vtmp = 
vld1q_u8(rp); 382+ vrpt = png_ptr(uint8x8x2_t,&vtmp); 383+ vrp = *vrpt; 384+ 385+ uint8x8x2_t *vnpt; 386+ uint8x8x2_t vnp; 387+ uint8x8x4_t vdestN; 388+ vdestN.val[3] = vdup_n_u8(0); 389+ 390+ vtmp = vld1q_u8(np); 391+ vnpt = png_ptr(uint8x8x2_t,&vtmp); 392+ vnp = *vnpt; 393 394- uint8x8x2_t *vppt; 395- uint8x8x2_t vpp; 396+ png_debug(1, "in png_read_filter_row_x2_avg3_neon"); 397 398- uint32x2_t *temp_pointer; 399+ uint8x8_t vtmp1, vtmp2, vtmp3; 400+ uint8x8x2_t *vppt; 401+ uint8x8x2_t vpp; 402+ uint32x2_t *temp_pointer; 403 404+ size_t tail_bytes = row_info->rowbytes % 12; 405+ png_byte last_byte = *rp_stop; 406+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); 407+ png_bytep rp_stop_new = rp_stop - tail_bytes; 408+ for (; rp < rp_stop_new; pp += 12) 409+ { 410 vtmp = vld1q_u8(pp); 411 vppt = png_ptr(uint8x8x2_t,&vtmp); 412 vpp = *vppt; 413@@ -212,36 +471,245 @@ png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row, 414 rp += 3; 415 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); 416 rp += 3; 417+ 418+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3); 419+ vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]); 420+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 421+ 422+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], 6); 423+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 424+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 425+ 426+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1); 427+ 428+ vtmp = vld1q_u8(np + 12); 429+ vnpt = png_ptr(uint8x8x2_t,&vtmp); 430+ vnp = *vnpt; 431+ 432+ vdestN.val[2] = vhadd_u8(vdestN.val[1], vdest.val[2]); 433+ vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp3); 434+ 435+ vdestN.val[3] = vhadd_u8(vdestN.val[2], vdest.val[3]); 436+ vdestN.val[3] = vadd_u8(vdestN.val[3], vtmp1); 437+ 438+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 439+ np += 3; 440+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0); 441+ np += 3; 442+ 
vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0); 443+ np += 3; 444+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[3]), 0); 445+ np += 3; 446 } 447+ 448+ vtmp = vld1q_u8(pp); 449+ vppt = png_ptr(uint8x8x2_t,&vtmp); 450+ vpp = *vppt; 451+ 452+ if (tail_bytes == 3) { 453+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 454+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 455+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 456+ 457+ vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]); 458+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 459+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 460+ } else if (tail_bytes == 6) { 461+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 462+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 463+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 464+ 465+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 466+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 467+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 468+ 469+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 470+ rp += 3; 471+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 472+ 473+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3); 474+ vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]); 475+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 476+ 477+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 478+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 479+ 480+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 481+ np += 3; 482+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0); 483+ } else if (tail_bytes == 9) { 484+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 485+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 486+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 487+ 488+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 489+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 
6); 490+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 491+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 492+ 493+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6); 494+ 495+ vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2); 496+ vdest.val[2] = vadd_u8(vdest.val[2], vtmp3); 497+ 498+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 499+ rp += 3; 500+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 501+ rp += 3; 502+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 503+ 504+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3); 505+ vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]); 506+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 507+ 508+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], 6); 509+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 510+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 511+ 512+ vdestN.val[2] = vhadd_u8(vdestN.val[1], vdest.val[2]); 513+ vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp3); 514+ 515+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 516+ np += 3; 517+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0); 518+ np += 3; 519+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0); 520+ } 521+ *rp_stop = last_byte; 522+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; 523 } 524+#endif 525 526 void 527 png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row, 528 png_const_bytep prev_row) 529 { 530 png_bytep rp = row; 531- png_bytep rp_stop = row + row_info->rowbytes; 532 png_const_bytep pp = prev_row; 533+ int count = row_info->rowbytes; 534 535 uint8x8x4_t vdest; 536 vdest.val[3] = vdup_n_u8(0); 537 538 png_debug(1, "in png_read_filter_row_avg4_neon"); 539 540- for (; rp < rp_stop; rp += 16, pp += 16) 541- { 542- uint32x2x4_t vtmp; 543- uint8x8x4_t *vrpt, *vppt; 544- uint8x8x4_t vrp, vpp; 545+ uint32x2x4_t vtmp; 546+ uint8x8x4_t *vrpt, *vppt; 547+ uint8x8x4_t vrp, vpp; 548+ 
uint32x2x4_t vdest_val; 549+ while (count >= 16) { 550 uint32x2x4_t *temp_pointer; 551- uint32x2x4_t vdest_val; 552+ vtmp = vld4_u32(png_ptr(uint32_t,rp)); 553+ vrpt = png_ptr(uint8x8x4_t,&vtmp); 554+ vrp = *vrpt; 555+ vtmp = vld4_u32(png_ptrc(uint32_t,pp)); 556+ vppt = png_ptr(uint8x8x4_t,&vtmp); 557+ vpp = *vppt; 558+ 559+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 560+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 561+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]); 562+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 563+ vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]); 564+ vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]); 565+ vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]); 566+ vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]); 567+ 568+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 569+ vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0); 570+ 571+ rp += 16; 572+ pp += 16; 573+ count -= 16; 574+ } 575+ 576+ if (count >= 8) { 577+ uint32x2x2_t vtmp1; 578+ uint8x8x2_t *vrpt1, *vppt1; 579+ uint8x8x2_t vrp1, vpp1; 580+ uint32x2x2_t *temp_pointer; 581+ uint32x2x2_t vdest_val1; 582+ 583+ vtmp1 = vld2_u32(png_ptr(uint32_t,rp)); 584+ vrpt1 = png_ptr(uint8x8x2_t,&vtmp1); 585+ vrp1 = *vrpt1; 586+ vtmp1 = vld2_u32(png_ptrc(uint32_t,pp)); 587+ vppt1 = png_ptr(uint8x8x2_t,&vtmp1); 588+ vpp1 = *vppt1; 589+ 590+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp1.val[0]); 591+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 592+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]); 593+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 594+ 595+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 596+ vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0); 597+ 598+ rp += 8; 599+ pp += 8; 600+ count -= 8; 601+ } 602+ 603+ if (count == 0) { 604+ return; 605+ } 606+ 607+ uint32x2_t vtmp2; 608+ uint8x8_t *vrpt2, *vppt2; 609+ uint8x8_t vrp2, vpp2; 610+ uint32x2_t *temp_pointer; 611+ uint32x2_t vdest_val2; 612+ 613+ vtmp2 = vld1_u32(png_ptr(uint32_t,rp)); 614+ vrpt2 = 
png_ptr(uint8x8_t,&vtmp2); 615+ vrp2 = *vrpt2; 616+ vtmp2 = vld1_u32(png_ptrc(uint32_t,pp)); 617+ vppt2 = png_ptr(uint8x8_t,&vtmp2); 618+ vpp2 = *vppt2; 619 620+ vdest.val[0] = vhadd_u8(vdest.val[1], vpp2); 621+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 622+ 623+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 624+ vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0); 625+} 626+ 627+#ifdef PNG_MULTY_LINE_ENABLE 628+void 629+png_read_filter_row_avg4_x2_neon(png_row_infop row_info, png_bytep row, 630+ png_const_bytep prev_row) 631+{ 632+ png_bytep rp = row; 633+ png_const_bytep pp = prev_row; 634+ int count = row_info->rowbytes; 635+ png_bytep np = row + count + 1; 636+ 637+ uint8x8x4_t vdest; 638+ vdest.val[3] = vdup_n_u8(0); 639+ 640+ png_debug(1, "in png_read_filter_row_avg4_x2_neon"); 641+ 642+ uint32x2x4_t vtmp; 643+ uint8x8x4_t *vrpt, *vppt; 644+ uint8x8x4_t vrp, vpp; 645+ uint32x2x4_t vdest_val; 646+ 647+ uint8x8x4_t *vnpt; 648+ uint8x8x4_t vnp; 649+ uint8x8x4_t vdestN; 650+ vdestN.val[3] = vdup_n_u8(0); 651+ 652+ while (count >= 16) { 653+ uint32x2x4_t *temp_pointer; 654 vtmp = vld4_u32(png_ptr(uint32_t,rp)); 655 vrpt = png_ptr(uint8x8x4_t,&vtmp); 656 vrp = *vrpt; 657 vtmp = vld4_u32(png_ptrc(uint32_t,pp)); 658 vppt = png_ptr(uint8x8x4_t,&vtmp); 659 vpp = *vppt; 660+ vtmp = vld4_u32(png_ptrc(uint32_t,np)); 661+ vnpt = png_ptr(uint8x8x4_t,&vtmp); 662+ vnp = *vnpt; 663 664 vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 665 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 666@@ -254,8 +722,97 @@ png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row, 667 668 vdest_val = png_ldr(uint32x2x4_t, &vdest); 669 vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0); 670+ 671+ vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]); 672+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 673+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 674+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]); 675+ vdestN.val[2] = vhadd_u8(vdestN.val[1], vdest.val[2]); 
676+ vdestN.val[2] = vadd_u8(vdestN.val[2], vnp.val[2]); 677+ vdestN.val[3] = vhadd_u8(vdestN.val[2], vdest.val[3]); 678+ vdestN.val[3] = vadd_u8(vdestN.val[3], vnp.val[3]); 679+ 680+ vdest_val = png_ldr(uint32x2x4_t, &vdestN); 681+ vst4_lane_u32(png_ptr(uint32_t,np), vdest_val, 0); 682+ 683+ rp += 16; 684+ pp += 16; 685+ np += 16; 686+ count -= 16; 687+ } 688+ 689+ if (count >= 8) { 690+ uint32x2x2_t vtmp1; 691+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1; 692+ uint8x8x2_t vrp1, vpp1, vnp1; 693+ uint32x2x2_t *temp_pointer; 694+ uint32x2x2_t vdest_val1; 695+ 696+ vtmp1 = vld2_u32(png_ptr(uint32_t,rp)); 697+ vrpt1 = png_ptr(uint8x8x2_t,&vtmp1); 698+ vrp1 = *vrpt1; 699+ vtmp1 = vld2_u32(png_ptrc(uint32_t,pp)); 700+ vppt1 = png_ptr(uint8x8x2_t,&vtmp1); 701+ vpp1 = *vppt1; 702+ vtmp1 = vld2_u32(png_ptrc(uint32_t,np)); 703+ vnpt1 = png_ptr(uint8x8x2_t,&vtmp1); 704+ vnp1 = *vnpt1; 705+ 706+ vdest.val[0] = vhadd_u8(vdest.val[3], vpp1.val[0]); 707+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 708+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]); 709+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 710+ 711+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 712+ vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0); 713+ 714+ vdestN.val[0] = vhadd_u8(vdestN.val[3], vdest.val[0]); 715+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]); 716+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 717+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]); 718+ 719+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN); 720+ vst2_lane_u32(png_ptr(uint32_t,np), vdest_val1, 0); 721+ 722+ rp += 8; 723+ pp += 8; 724+ np += 8; 725+ count -= 8; 726 } 727+ 728+ if (count == 0) { 729+ return; 730+ } 731+ 732+ uint32x2_t vtmp2; 733+ uint8x8_t *vrpt2, *vppt2, *vnpt2; 734+ uint8x8_t vrp2, vpp2, vnp2; 735+ uint32x2_t *temp_pointer; 736+ uint32x2_t vdest_val2; 737+ 738+ vtmp2 = vld1_u32(png_ptr(uint32_t,rp)); 739+ vrpt2 = png_ptr(uint8x8_t,&vtmp2); 740+ vrp2 = *vrpt2; 741+ vtmp2 = 
vld1_u32(png_ptrc(uint32_t,pp)); 742+ vppt2 = png_ptr(uint8x8_t,&vtmp2); 743+ vpp2 = *vppt2; 744+ vtmp2 = vld1_u32(png_ptrc(uint32_t,np)); 745+ vnpt2 = png_ptr(uint8x8_t,&vtmp2); 746+ vnp2 = *vnpt2; 747+ 748+ vdest.val[0] = vhadd_u8(vdest.val[1], vpp2); 749+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 750+ 751+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 752+ vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0); 753+ 754+ vdestN.val[0] = vhadd_u8(vdestN.val[1], vdest.val[0]); 755+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2); 756+ 757+ vdest_val2 = png_ldr(uint32x2_t, &vdestN); 758+ vst1_lane_u32(png_ptr(uint32_t,np), vdest_val2, 0); 759 } 760+#endif 761 762 static uint8x8_t 763 paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c) 764@@ -303,15 +860,145 @@ png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, 765 vrpt = png_ptr(uint8x8x2_t,&vtmp); 766 vrp = *vrpt; 767 768+ uint8x8x2_t *vppt; 769+ uint8x8x2_t vpp; 770+ uint8x8_t vtmp1, vtmp2, vtmp3; 771+ uint32x2_t *temp_pointer; 772+ 773 png_debug(1, "in png_read_filter_row_paeth3_neon"); 774 775- for (; rp < rp_stop; pp += 12) 776+ size_t tail_bytes = row_info->rowbytes % 12; 777+ png_byte last_byte = *rp_stop; 778+ png_bytep rp_stop_new = rp_stop - tail_bytes; 779+ for (; rp < rp_stop_new; pp += 12) 780 { 781- uint8x8x2_t *vppt; 782- uint8x8x2_t vpp; 783- uint8x8_t vtmp1, vtmp2, vtmp3; 784- uint32x2_t *temp_pointer; 785+ vtmp = vld1q_u8(pp); 786+ vppt = png_ptr(uint8x8x2_t,&vtmp); 787+ vpp = *vppt; 788+ 789+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 790+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 791 792+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 793+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 794+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 795+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 796+ 797+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6); 798+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6); 799+ vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2); 800+ vdest.val[2] = 
vadd_u8(vdest.val[2], vtmp1); 801+ 802+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 803+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 804+ 805+ vtmp = vld1q_u8(rp + 12); 806+ vrpt = png_ptr(uint8x8x2_t,&vtmp); 807+ vrp = *vrpt; 808+ 809+ vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3); 810+ vdest.val[3] = vadd_u8(vdest.val[3], vtmp1); 811+ 812+ vlast = vtmp2; 813+ 814+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 815+ rp += 3; 816+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 817+ rp += 3; 818+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 819+ rp += 3; 820+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); 821+ rp += 3; 822+ } 823+ 824+ vtmp = vld1q_u8(pp); 825+ vppt = png_ptr(uint8x8x2_t,&vtmp); 826+ vpp = *vppt; 827+ 828+ if (tail_bytes == 3) { 829+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 830+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 831+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 832+ } else if (tail_bytes == 6) { 833+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 834+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 835+ 836+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 837+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 838+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 839+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 840+ 841+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 842+ rp += 3; 843+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 844+ } else if (tail_bytes == 9) { 845+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 846+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 847+ 848+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 849+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 850+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 851+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 852+ 853+ vtmp1 = 
vext_u8(vrp.val[0], vrp.val[1], 6); 854+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6); 855+ vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2); 856+ vdest.val[2] = vadd_u8(vdest.val[2], vtmp1); 857+ 858+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 859+ rp += 3; 860+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 861+ rp += 3; 862+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 863+ } 864+ *rp_stop = last_byte; 865+} 866+ 867+#ifdef PNG_MULTY_LINE_ENABLE 868+void 869+png_read_filter_row_paeth3_x2_neon(png_row_infop row_info, png_bytep row, 870+ png_const_bytep prev_row) 871+{ 872+ png_bytep rp = row; 873+ png_const_bytep pp = prev_row; 874+ png_bytep rp_stop = row + row_info->rowbytes; 875+ png_bytep np = rp_stop + 1; 876+ 877+ uint8x16_t vtmp; 878+ uint8x8x2_t *vrpt; 879+ uint8x8x2_t vrp; 880+ uint8x8_t vlast = vdup_n_u8(0); 881+ uint8x8x4_t vdest; 882+ vdest.val[3] = vdup_n_u8(0); 883+ 884+ vtmp = vld1q_u8(rp); 885+ vrpt = png_ptr(uint8x8x2_t,&vtmp); 886+ vrp = *vrpt; 887+ 888+ uint8x8x2_t *vppt; 889+ uint8x8x2_t vpp; 890+ uint8x8_t vtmp1, vtmp2, vtmp3; 891+ uint32x2_t *temp_pointer; 892+ 893+ uint8x8x2_t *vnpt; 894+ uint8x8x2_t vnp; 895+ uint8x8_t vlastN = vdup_n_u8(0); 896+ uint8x8x4_t vdestN; 897+ vdestN.val[3] = vdup_n_u8(0); 898+ 899+ vtmp = vld1q_u8(np); 900+ vnpt = png_ptr(uint8x8x2_t,&vtmp); 901+ vnp = *vnpt; 902+ 903+ png_debug(1, "in png_read_filter_row_paeth3_x2_neon"); 904+ 905+ size_t tail_bytes = row_info->rowbytes % 12; 906+ png_byte last_byte = *rp_stop; 907+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); 908+ png_bytep rp_stop_new = rp_stop - tail_bytes; 909+ 910+ for (; rp < rp_stop_new; pp += 12) 911+ { 912 vtmp = vld1q_u8(pp); 913 vppt = png_ptr(uint8x8x2_t,&vtmp); 914 vpp = *vppt; 915@@ -349,15 +1036,123 @@ png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, 916 rp += 3; 917 vst1_lane_u32(png_ptr(uint32_t,rp), 
png_ldr(uint32x2_t,&vdest.val[3]), 0); 918 rp += 3; 919+ 920+ vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN); 921+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 922+ 923+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3); 924+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 925+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 926+ 927+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 6); 928+ vdestN.val[2] = paeth(vdestN.val[1], vdest.val[2], vdest.val[1]); 929+ vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp1); 930+ 931+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1); 932+ 933+ vtmp = vld1q_u8(np + 12); 934+ vnpt = png_ptr(uint8x8x2_t,&vtmp); 935+ vnp = *vnpt; 936+ 937+ vdestN.val[3] = paeth(vdestN.val[2], vdest.val[3], vdest.val[2]); 938+ vdestN.val[3] = vadd_u8(vdestN.val[3], vtmp1); 939+ 940+ vlastN = vdest.val[3]; 941+ 942+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 943+ np += 3; 944+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0); 945+ np += 3; 946+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0); 947+ np += 3; 948+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[3]), 0); 949+ np += 3; 950 } 951+ 952+ vtmp = vld1q_u8(pp); 953+ vppt = png_ptr(uint8x8x2_t,&vtmp); 954+ vpp = *vppt; 955+ 956+ if (tail_bytes == 3) { 957+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 958+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 959+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 960+ 961+ vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN); 962+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 963+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 964+ } else if (tail_bytes == 6) { 965+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 966+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 967+ 968+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 969+ vtmp2 = vext_u8(vpp.val[0], 
vpp.val[1], 3); 970+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 971+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 972+ 973+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 974+ rp += 3; 975+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 976+ 977+ vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN); 978+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 979+ 980+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3); 981+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 982+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 983+ 984+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 985+ np += 3; 986+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0); 987+ 988+ } else if (tail_bytes == 9) { 989+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 990+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 991+ 992+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 993+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 994+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 995+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 996+ 997+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6); 998+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6); 999+ vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2); 1000+ vdest.val[2] = vadd_u8(vdest.val[2], vtmp1); 1001+ 1002+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 1003+ rp += 3; 1004+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 1005+ rp += 3; 1006+ vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 1007+ 1008+ vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN); 1009+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1010+ 1011+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], 3); 1012+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1013+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 1014+ 1015+ vtmp1 = vext_u8(vnp.val[0], 
vnp.val[1], 6); 1016+ vdestN.val[2] = paeth(vdestN.val[1], vdest.val[2], vdest.val[1]); 1017+ vdestN.val[2] = vadd_u8(vdestN.val[2], vtmp1); 1018+ 1019+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[0]), 0); 1020+ np += 3; 1021+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[1]), 0); 1022+ np += 3; 1023+ vst1_lane_u32(png_ptr(uint32_t,np), png_ldr(uint32x2_t,&vdestN.val[2]), 0); 1024+ } 1025+ *rp_stop = last_byte; 1026+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; 1027 } 1028+#endif 1029 1030 void 1031 png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 1032 png_const_bytep prev_row) 1033 { 1034 png_bytep rp = row; 1035- png_bytep rp_stop = row + row_info->rowbytes; 1036+ int count = row_info->rowbytes; 1037 png_const_bytep pp = prev_row; 1038 1039 uint8x8_t vlast = vdup_n_u8(0); 1040@@ -366,20 +1161,129 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 1041 1042 png_debug(1, "in png_read_filter_row_paeth4_neon"); 1043 1044- for (; rp < rp_stop; rp += 16, pp += 16) 1045- { 1046- uint32x2x4_t vtmp; 1047- uint8x8x4_t *vrpt, *vppt; 1048- uint8x8x4_t vrp, vpp; 1049+ uint32x2x4_t vtmp; 1050+ uint8x8x4_t *vrpt, *vppt; 1051+ uint8x8x4_t vrp, vpp; 1052+ uint32x2x4_t vdest_val; 1053+ while (count >= 16) { 1054 uint32x2x4_t *temp_pointer; 1055- uint32x2x4_t vdest_val; 1056+ vtmp = vld4_u32(png_ptr(uint32_t,rp)); 1057+ vrpt = png_ptr(uint8x8x4_t,&vtmp); 1058+ vrp = *vrpt; 1059+ vtmp = vld4_u32(png_ptrc(uint32_t,pp)); 1060+ vppt = png_ptr(uint8x8x4_t,&vtmp); 1061+ vpp = *vppt; 1062 1063+ vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 1064+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1065+ vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); 1066+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 1067+ vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]); 1068+ vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]); 1069+ vdest.val[3] = paeth(vdest.val[2], 
vpp.val[3], vpp.val[2]); 1070+ vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]); 1071+ 1072+ vlast = vpp.val[3]; 1073+ 1074+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 1075+ vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0); 1076+ 1077+ rp += 16; 1078+ pp += 16; 1079+ count -= 16; 1080+ } 1081+ 1082+ if (count >= 8) { 1083+ uint32x2x2_t vtmp1; 1084+ uint8x8x2_t *vrpt1, *vppt1; 1085+ uint8x8x2_t vrp1, vpp1; 1086+ uint32x2x2_t *temp_pointer; 1087+ uint32x2x2_t vdest_val1; 1088+ 1089+ vtmp1 = vld2_u32(png_ptr(uint32_t,rp)); 1090+ vrpt1 = png_ptr(uint8x8x2_t,&vtmp1); 1091+ vrp1 = *vrpt1; 1092+ vtmp1 = vld2_u32(png_ptrc(uint32_t,pp)); 1093+ vppt1 = png_ptr(uint8x8x2_t,&vtmp1); 1094+ vpp1 = *vppt1; 1095+ 1096+ vdest.val[0] = paeth(vdest.val[3], vpp1.val[0], vlast); 1097+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 1098+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]); 1099+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 1100+ vlast = vpp1.val[1]; 1101+ 1102+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 1103+ vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0); 1104+ vdest.val[3] = vdest.val[1]; 1105+ 1106+ rp += 8; 1107+ pp += 8; 1108+ count -= 8; 1109+ } 1110+ 1111+ if (count == 0) { 1112+ return; 1113+ } 1114+ 1115+ uint32x2_t vtmp2; 1116+ uint8x8_t *vrpt2, *vppt2; 1117+ uint8x8_t vrp2, vpp2; 1118+ uint32x2_t *temp_pointer; 1119+ uint32x2_t vdest_val2; 1120+ 1121+ vtmp2 = vld1_u32(png_ptr(uint32_t,rp)); 1122+ vrpt2 = png_ptr(uint8x8_t,&vtmp2); 1123+ vrp2 = *vrpt2; 1124+ vtmp2 = vld1_u32(png_ptrc(uint32_t,pp)); 1125+ vppt2 = png_ptr(uint8x8_t,&vtmp2); 1126+ vpp2 = *vppt2; 1127+ 1128+ vdest.val[0] = paeth(vdest.val[3], vpp2, vlast); 1129+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 1130+ 1131+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 1132+ vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0); 1133+} 1134+ 1135+#ifdef PNG_MULTY_LINE_ENABLE 1136+void 1137+png_read_filter_row_paeth4_x2_neon(png_row_infop row_info, png_bytep row, 1138+ png_const_bytep 
prev_row) 1139+{ 1140+ png_bytep rp = row; 1141+ int count = row_info->rowbytes; 1142+ png_const_bytep pp = prev_row; 1143+ png_bytep np = row + row_info->rowbytes + 1; 1144+ 1145+ uint8x8_t vlast = vdup_n_u8(0); 1146+ uint8x8x4_t vdest; 1147+ vdest.val[3] = vdup_n_u8(0); 1148+ 1149+ png_debug(1, "in png_read_filter_row_paeth4_x2_neon"); 1150+ 1151+ uint32x2x4_t vtmp; 1152+ uint8x8x4_t *vrpt, *vppt; 1153+ uint8x8x4_t vrp, vpp; 1154+ uint32x2x4_t vdest_val; 1155+ 1156+ uint8x8x4_t *vnpt; 1157+ uint8x8x4_t vnp; 1158+ uint8x8_t vlastN = vdup_n_u8(0); 1159+ uint8x8x4_t vdestN; 1160+ vdestN.val[3] = vdup_n_u8(0); 1161+ 1162+ while (count >= 16) { 1163+ uint32x2x4_t *temp_pointer; 1164 vtmp = vld4_u32(png_ptr(uint32_t,rp)); 1165 vrpt = png_ptr(uint8x8x4_t,&vtmp); 1166 vrp = *vrpt; 1167 vtmp = vld4_u32(png_ptrc(uint32_t,pp)); 1168 vppt = png_ptr(uint8x8x4_t,&vtmp); 1169 vpp = *vppt; 1170+ vtmp = vld4_u32(png_ptrc(uint32_t,np)); 1171+ vnpt = png_ptr(uint8x8x4_t,&vtmp); 1172+ vnp = *vnpt; 1173 1174 vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 1175 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1176@@ -394,8 +1298,107 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 1177 1178 vdest_val = png_ldr(uint32x2x4_t, &vdest); 1179 vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0); 1180+ 1181+ vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN); 1182+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1183+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1184+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]); 1185+ vdestN.val[2] = paeth(vdestN.val[1], vdest.val[2], vdest.val[1]); 1186+ vdestN.val[2] = vadd_u8(vdestN.val[2], vnp.val[2]); 1187+ vdestN.val[3] = paeth(vdestN.val[2], vdest.val[3], vdest.val[2]); 1188+ vdestN.val[3] = vadd_u8(vdestN.val[3], vnp.val[3]); 1189+ 1190+ vlastN = vdest.val[3]; 1191+ 1192+ vdest_val = png_ldr(uint32x2x4_t, &vdestN); 1193+ vst4_lane_u32(png_ptr(uint32_t,np), vdest_val, 0); 1194+ 
1195+ rp += 16; 1196+ pp += 16; 1197+ np += 16; 1198+ count -= 16; 1199 } 1200+ 1201+ if (count >= 8) { 1202+ uint32x2x2_t vtmp1; 1203+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1; 1204+ uint8x8x2_t vrp1, vpp1, vnp1; 1205+ uint32x2x2_t *temp_pointer; 1206+ uint32x2x2_t vdest_val1; 1207+ 1208+ vtmp1 = vld2_u32(png_ptr(uint32_t,rp)); 1209+ vrpt1 = png_ptr(uint8x8x2_t,&vtmp1); 1210+ vrp1 = *vrpt1; 1211+ vtmp1 = vld2_u32(png_ptrc(uint32_t,pp)); 1212+ vppt1 = png_ptr(uint8x8x2_t,&vtmp1); 1213+ vpp1 = *vppt1; 1214+ vtmp1 = vld2_u32(png_ptrc(uint32_t,np)); 1215+ vnpt1 = png_ptr(uint8x8x2_t,&vtmp1); 1216+ vnp1 = *vnpt1; 1217+ 1218+ vdest.val[0] = paeth(vdest.val[3], vpp1.val[0], vlast); 1219+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 1220+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]); 1221+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 1222+ 1223+ vlast = vpp1.val[1]; 1224+ 1225+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 1226+ vst2_lane_u32(png_ptr(uint32_t,rp), vdest_val1, 0); 1227+ 1228+ vdest.val[3] = vdest.val[1]; 1229+ 1230+ vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN); 1231+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]); 1232+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1233+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]); 1234+ 1235+ vlastN = vdest.val[1]; 1236+ 1237+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN); 1238+ vst2_lane_u32(png_ptr(uint32_t,np), vdest_val1, 0); 1239+ 1240+ vdestN.val[3] = vdestN.val[1]; 1241+ 1242+ rp += 8; 1243+ pp += 8; 1244+ np += 8; 1245+ count -= 8; 1246+ } 1247+ 1248+ if (count == 0) { 1249+ return; 1250+ } 1251+ 1252+ uint32x2_t vtmp2; 1253+ uint8x8_t *vrpt2, *vppt2, *vnpt2; 1254+ uint8x8_t vrp2, vpp2, vnp2; 1255+ uint32x2_t *temp_pointer; 1256+ uint32x2_t vdest_val2; 1257+ 1258+ vtmp2 = vld1_u32(png_ptr(uint32_t,rp)); 1259+ vrpt2 = png_ptr(uint8x8_t,&vtmp2); 1260+ vrp2 = *vrpt2; 1261+ vtmp2 = vld1_u32(png_ptrc(uint32_t,pp)); 1262+ vppt2 = png_ptr(uint8x8_t,&vtmp2); 
1263+ vpp2 = *vppt2; 1264+ vtmp2 = vld1_u32(png_ptrc(uint32_t,np)); 1265+ vnpt2 = png_ptr(uint8x8_t,&vtmp2); 1266+ vnp2 = *vnpt2; 1267+ 1268+ vdest.val[0] = paeth(vdest.val[3], vpp2, vlast); 1269+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 1270+ 1271+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 1272+ vst1_lane_u32(png_ptr(uint32_t,rp), vdest_val2, 0); 1273+ 1274+ vdestN.val[0] = paeth(vdestN.val[3], vdest.val[0], vlastN); 1275+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2); 1276+ 1277+ vdest_val2 = png_ldr(uint32x2_t, &vdestN); 1278+ vst1_lane_u32(png_ptr(uint32_t,np), vdest_val2, 0); 1279 } 1280+#endif 1281 1282 #endif /* PNG_ARM_NEON_OPT > 0 */ 1283 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */ 1284diff --git a/pngpread.c b/pngpread.c 1285index e283627..f22f6a3 100644 1286--- a/pngpread.c 1287+++ b/pngpread.c 1288@@ -262,11 +262,35@@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr) 1289 else if (chunk_name == png_IDAT) 1290 { 1291 png_ptr->idat_size = png_ptr->push_length; 1292+ 1293+#ifdef PNG_MULTY_LINE_ENABLE 1294+ // init inflate_buff 1295+ if (png_ptr->inflate_buff_max_size < png_ptr->push_length) 1296+ { 1297+ png_free(png_ptr, png_ptr->inflate_buff); 1298+ png_ptr->inflate_buff = png_voidcast(png_bytep, 1299+ png_malloc(png_ptr, png_ptr->push_length)); 1300+ png_ptr->inflate_buff_size = 0; 1301+ } 1302+ png_ptr->inflate_buff_max_size = png_ptr->push_length; 1303+#endif 1304+ 1305 png_ptr->process_mode = PNG_READ_IDAT_MODE; 1306 png_push_have_info(png_ptr, info_ptr); 1307- png_ptr->zstream.avail_out = 1308- (uInt) PNG_ROWBYTES(png_ptr->pixel_depth, 1309- png_ptr->iwidth) + 1; 1310+#ifdef PNG_MULTY_LINE_ENABLE 1311+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1312+ (png_ptr->transformations & PNG_CHECK) == 0) { 1313+ int rest = png_ptr->num_rows - png_ptr->row_number; 1314+ int row_num = rest < PNG_INFLATE_ROWS ? 
rest : PNG_INFLATE_ROWS; 1315+ png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, 1316+ png_ptr->iwidth) + 1) * row_num; // 一次解压多行 1317+ } 1318+ else 1319+#endif 1320+ { 1321+ png_ptr->zstream.avail_out = 1322+ (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, png_ptr->iwidth) + 1); 1323+ } 1324 png_ptr->zstream.next_out = png_ptr->row_buf; 1325 return; 1326 } 1327@@ -558,8 +582,110 @@ png_push_read_IDAT(png_structrp png_ptr) 1328 } 1329 1330 png_ptr->idat_size = png_ptr->push_length; 1331+#ifdef PNG_MULTY_LINE_ENABLE 1332+ // init inflate_buff 1333+ if (png_ptr->inflate_buff_max_size < png_ptr->push_length) 1334+ { 1335+ png_free(png_ptr, png_ptr->inflate_buff); 1336+ png_ptr->inflate_buff = png_voidcast(png_bytep, 1337+ png_malloc(png_ptr, png_ptr->push_length)); 1338+ png_ptr->inflate_buff_size = 0; 1339+ } 1340+ png_ptr->inflate_buff_max_size = png_ptr->push_length; 1341+#endif 1342 } 1343 1344+#ifdef PNG_MULTY_LINE_ENABLE 1345+ if (png_ptr->idat_size != 0 && png_ptr->save_buffer_size != 0) 1346+ { 1347+ if (png_ptr->idat_size <= png_ptr->save_buffer_size) 1348+ { 1349+ png_debug2(1, "png_IDAT1: idat_size=%d save_buffer_size=%ld", 1350+ png_ptr->idat_size, png_ptr->save_buffer_size); 1351+ 1352+ size_t save_size = png_ptr->idat_size; 1353+ 1354+ png_calculate_crc(png_ptr, png_ptr->save_buffer_ptr, save_size); 1355+ png_process_IDAT_data(png_ptr, png_ptr->save_buffer_ptr, save_size); 1356+ 1357+ png_ptr->buffer_size -= save_size; 1358+ png_ptr->save_buffer_size -= save_size; 1359+ png_ptr->save_buffer_ptr += save_size; 1360+ png_ptr->idat_size = 0; 1361+ } 1362+ 1363+ else 1364+ { 1365+ png_debug2(1, "png_IDAT2: idat_size=%d save_buffer_size=%ld", 1366+ png_ptr->idat_size, png_ptr->save_buffer_size); 1367+ 1368+ size_t save_size = png_ptr->save_buffer_size; 1369+ 1370+ memcpy(png_ptr->inflate_buff, png_ptr->save_buffer_ptr, save_size); 1371+ 1372+ png_ptr->inflate_buff_size = save_size; 1373+ png_ptr->buffer_size -= save_size; 1374+ 
png_ptr->save_buffer_ptr += save_size; 1375+ png_ptr->save_buffer_size = 0; 1376+ } 1377+ } 1378+ 1379+ if (png_ptr->idat_size != 0 && png_ptr->current_buffer_size != 0) 1380+ { 1381+ size_t save_size = png_ptr->current_buffer_size; 1382+ if (png_ptr->idat_size > png_ptr->inflate_buff_size + save_size) 1383+ { 1384+ png_debug2(1, "png_IDAT3: inflate_buff_size=%ld current_buffer_size=%ld", 1385+ png_ptr->inflate_buff_size, save_size); 1386+ 1387+ memcpy(png_ptr->inflate_buff + png_ptr->inflate_buff_size, 1388+ png_ptr->current_buffer_ptr, save_size); 1389+ 1390+ png_ptr->inflate_buff_size += save_size; 1391+ png_ptr->buffer_size -= save_size; 1392+ png_ptr->current_buffer_ptr += save_size; 1393+ png_ptr->current_buffer_size = 0; 1394+ } 1395+ 1396+ else 1397+ { 1398+ if (png_ptr->inflate_buff_size == 0) 1399+ { 1400+ png_debug2(1, "png_IDAT4: inflate_buff_size=%ld current_buffer_size=%ld", 1401+ png_ptr->inflate_buff_size, save_size); 1402+ 1403+ save_size = png_ptr->idat_size; 1404+ 1405+ png_calculate_crc(png_ptr, png_ptr->current_buffer_ptr, save_size); 1406+ png_process_IDAT_data(png_ptr, png_ptr->current_buffer_ptr, save_size); 1407+ 1408+ png_ptr->buffer_size -= save_size; 1409+ png_ptr->current_buffer_size -= save_size; 1410+ png_ptr->current_buffer_ptr += save_size; 1411+ png_ptr->idat_size = 0; 1412+ } 1413+ 1414+ else 1415+ { 1416+ save_size = png_ptr->idat_size - png_ptr->inflate_buff_size; 1417+ png_debug2(1, "png_IDAT5: inflate_buff_size=%ld save_size=%ld", 1418+ png_ptr->inflate_buff_size, save_size); 1419+ 1420+ memcpy(png_ptr->inflate_buff + png_ptr->inflate_buff_size, 1421+ png_ptr->current_buffer_ptr, save_size); 1422+ 1423+ png_ptr->inflate_buff_size = 0; 1424+ png_calculate_crc(png_ptr, png_ptr->inflate_buff, png_ptr->idat_size); 1425+ png_process_IDAT_data(png_ptr, png_ptr->inflate_buff, png_ptr->idat_size); 1426+ 1427+ png_ptr->buffer_size -= save_size; 1428+ png_ptr->current_buffer_size -= save_size; 1429+ png_ptr->current_buffer_ptr += 
save_size;
1430+            png_ptr->idat_size = 0;
1431+         }
1432+      }
1433+   }
1434+#else
1435    if (png_ptr->idat_size != 0 && png_ptr->save_buffer_size != 0)
1436    {
1437       size_t save_size = png_ptr->save_buffer_size;
1438@@ -612,6 +738,7 @@ png_push_read_IDAT(png_structrp png_ptr)
1439       png_ptr->current_buffer_size -= save_size;
1440       png_ptr->current_buffer_ptr += save_size;
1441    }
1442+#endif
1443 
1444    if (png_ptr->idat_size == 0)
1445    {
1446@@ -623,6 +750,98 @@ png_push_read_IDAT(png_structrp png_ptr)
1447    }
1448 }
1449 
1450+#ifdef PNG_MULTY_LINE_ENABLE
1451+void /* PRIVATE */
1452+png_push_process_row_x2(png_structrp png_ptr, png_row_info row_info_origin)
1453+{
1454+   png_debug(1, "in png_push_process_row_x2");
1455+   /* Process two consecutive rows that carry the same filter byte. */
1456+   png_row_info row_info = row_info_origin;
1457+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1458+       png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
1459+
1460+   /* Adding 4 to the filter byte selects the *_X2 slot of read_filter, i.e.
1461+    * the routine that handles this row together with the one stored
1462+    * rowbytes + 1 bytes after it.  From here on the two rows are
1463+    * transformed and reported to the application one after the other.
1464+    */
1465+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1466+   if (png_ptr->transformations != 0)
1467+      png_do_read_transformations(png_ptr, &row_info);
1468+#endif
1469+
1470+   /* The transformed pixel depth should match the depth now in row_info. */
1471+   if (png_ptr->transformed_pixel_depth == 0)
1472+   {
1473+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1474+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1475+         png_error(png_ptr, "progressive row overflow");
1476+   }
1477+
1478+   png_push_have_row(png_ptr, png_ptr->row_buf + 1);
1479+   png_read_push_finish_row(png_ptr);
1480+
1481+   png_ptr->row_buf = png_ptr->row_buf + png_ptr->rowbytes + 1;
1482+
1483+   row_info = row_info_origin; /* second row: drop what the transforms changed */
1484+   if (png_ptr->transformations != 0)
1485+   {
1486+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_origin.rowbytes + 1);
1487+   }
1488+   else
1489+   {
1490+      png_ptr->prev_row = png_ptr->row_buf;
1491+   }
1492+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1493+   if (png_ptr->transformations != 0)
1494+      png_do_read_transformations(png_ptr, &row_info);
1495+#endif
1496+
1497+   png_push_have_row(png_ptr, png_ptr->row_buf + 1);
1498+   png_read_push_finish_row(png_ptr);
1499+}
1500+/* Process row_num inflated rows that lie back to back in row_buf. */
1501+void png_push_process_multi_rows(png_structrp png_ptr, int row_num)
1502+{
1503+   png_debug(1, "in png_push_process_multi_rows");
1504+   uInt row_bytes = png_ptr->rowbytes + 1; /* filter byte + pixel data */
1505+
1506+   png_row_info row_info;
1507+   row_info.width = png_ptr->iwidth;
1508+   row_info.color_type = png_ptr->color_type;
1509+   row_info.bit_depth = png_ptr->bit_depth;
1510+   row_info.channels = png_ptr->channels;
1511+   row_info.pixel_depth = png_ptr->pixel_depth;
1512+   row_info.rowbytes = png_ptr->rowbytes;
1513+
1514+   png_bytep temp_row = png_ptr->row_buf; /* both pointers are restored below */
1515+   png_bytep temp_prev_row = png_ptr->prev_row;
1516+
1517+   for (int i = 0; i < row_num; i++) {
1518+      if ((png_ptr->channels == 3 || png_ptr->channels == 4) &&
1519+          i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
1520+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1521+          png_ptr->row_buf[0] == png_ptr->row_buf[row_bytes]
1522+          )
1523+      {
1524+         png_push_process_row_x2(png_ptr, row_info);
1525+         png_ptr->row_buf = png_ptr->row_buf + row_bytes; /* x2 already stepped once */
1526+         i++; /* the pair consumed two rows */
1527+         continue;
1528+      }
1529+      png_push_process_row(png_ptr);
1530+      png_ptr->row_buf = png_ptr->row_buf + row_bytes;
1531+   }
1532+
1533+   if (row_num > 0 && png_ptr->transformations == 0 && png_ptr->interlaced == 0)
1534+   {
1535+      png_ptr->prev_row = temp_prev_row; /* was redirected into row_buf above */
1536+      memcpy(png_ptr->prev_row, png_ptr->row_buf - row_bytes, row_bytes);
1537+   }
1538+   png_ptr->row_buf = temp_row;
1539+}
1540+#endif
1541+
1542 void /* PRIVATE */
1543 png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1544     size_t buffer_length)
1545@@ -639,6 +858,16 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1546    /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1547    png_ptr->zstream.avail_in = (uInt)buffer_length;
1548 
1549+   int row_num = 1;
1550+#ifdef PNG_MULTY_LINE_ENABLE
1551+   if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1552+       (png_ptr->transformations & PNG_CHECK) == 0)
1553+   {
1554+      int rest = png_ptr->num_rows - png_ptr->row_number;
1555+      row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1556+   }
1557+#endif
1558+
1559    /* Keep going until the decompressed data is all processed
1560     * or the stream marked as finished.
1561     */
1562@@ -655,8 +884,16 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1563       if (!(png_ptr->zstream.avail_out > 0))
1564       {
1565          /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1566+#ifdef PNG_MULTY_LINE_ENABLE
1567+         if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1568+             (png_ptr->transformations & PNG_CHECK) == 0)
1569+         {
1570+            int rest = png_ptr->num_rows - png_ptr->row_number;
1571+            row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1572+         }
1573+#endif
1574          png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1575-             png_ptr->iwidth) + 1);
1576+             png_ptr->iwidth) + 1) * row_num;
1577 
1578          png_ptr->zstream.next_out = png_ptr->row_buf;
1579       }
1580@@ -719,7 +956,11 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1581 
1582          /* Do we have a complete row?
*/ 1583 if (png_ptr->zstream.avail_out == 0) 1584+#ifdef PNG_MULTY_LINE_ENABLE 1585+ png_push_process_multi_rows(png_ptr, row_num); 1586+#else 1587 png_push_process_row(png_ptr); 1588+#endif 1589 } 1590 1591 /* And check for the end of the stream. */ 1592@@ -738,6 +979,7 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1593 void /* PRIVATE */ 1594 png_push_process_row(png_structrp png_ptr) 1595 { 1596+ png_debug(1, "in png_push_process_row"); 1597 /* 1.5.6: row_info moved out of png_struct to a local here. */ 1598 png_row_info row_info; 1599 1600@@ -762,8 +1004,16 @@ png_push_process_row(png_structrp png_ptr) 1601 * it may not be in the future, so this was changed just to copy the 1602 * interlaced row count: 1603 */ 1604- memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1605- 1606+#ifdef PNG_MULTY_LINE_ENABLE 1607+ if (png_ptr->transformations == 0 && png_ptr->interlaced == 0) 1608+ { 1609+ png_ptr->prev_row = png_ptr->row_buf; 1610+ } 1611+ else 1612+#endif 1613+ { 1614+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1615+ } 1616 #ifdef PNG_READ_TRANSFORMS_SUPPORTED 1617 if (png_ptr->transformations != 0) 1618 png_do_read_transformations(png_ptr, &row_info); 1619diff --git a/pngpriv.h b/pngpriv.h 1620index 2e426cf..6d60f70 100644 1621--- a/pngpriv.h 1622+++ b/pngpriv.h 1623@@ -127,7 +127,7 @@ 1624 * associated assembler code, pass --enable-arm-neon=no to configure 1625 * or put -DPNG_ARM_NEON_OPT=0 in CPPFLAGS. 
1626 */ 1627-# if defined(PNG_ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \ 1628+# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \ 1629 defined(PNG_ALIGNED_MEMORY_SUPPORTED) 1630 # define PNG_ARM_NEON_OPT 2 1631 # else 1632@@ -1304,6 +1315,18 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop 1633 row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1634 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop 1635 row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1636+#ifdef PNG_MULTY_LINE_ENABLE 1637+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_x2_neon,(png_row_infop 1638+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1639+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_x2_neon,(png_row_infop 1640+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1641+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_x2_neon,(png_row_infop 1642+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1643+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_x2_neon,(png_row_infop 1644+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1645+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_x2_neon,(png_row_infop 1646+ row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1647+#endif 1648 #endif 1649 1650 #if PNG_MIPS_MSA_OPT > 0 1651diff --git a/pngread.c b/pngread.c 1652index 5ab9224..6efc5b6 100644 1653--- a/pngread.c 1654+++ b/pngread.c 1655@@ -54,7 +54,11 @@ png_create_read_struct_2,(png_const_charp user_png_ver, png_voidp error_ptr, 1656 * required (it will be zero in a write structure.) 
1657 */
1658 # ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1659+#ifdef PNG_MULTY_LINE_ENABLE
1660+   png_ptr->IDAT_read_size = PNG_INFLATE_MAX_SIZE;
1661+#else
1662       png_ptr->IDAT_read_size = PNG_IDAT_READ_SIZE;
1663+#endif
1664 # endif
1665 
1666 # ifdef PNG_BENIGN_READ_ERRORS_SUPPORTED
1667@@ -684,6 +688,184 @@ png_read_rows(png_structrp png_ptr, png_bytepp row,
1668 #endif /* SEQUENTIAL_READ */
1669 
1670 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1671+/* Two-row fast path: defilter rows i and i+1 together, then deliver each. */
1672+#ifdef PNG_MULTY_LINE_ENABLE
1673+void png_read_two_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 i,
1674+                       png_row_info row_info)
1675+{
1676+   const png_row_info row_info_origin = row_info; /* layout before any transform */
1677+   png_debug1(1, "in png_read_two_rows %d", png_ptr->row_buf[0]);
1678+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1679+       png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
1680+#ifdef PNG_MNG_FEATURES_SUPPORTED
1681+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1682+       (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1683+   {
1684+      /* Intrapixel differencing */
1685+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1686+   }
1687+#endif
1688+
1689+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1690+   if (png_ptr->transformations
1691+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1692+       || png_ptr->num_palette_max >= 0
1693+# endif
1694+      )
1695+      png_do_read_transformations(png_ptr, &row_info);
1696+#endif
1697+
1698+   /* The transformed pixel depth should match the depth now in row_info. */
1699+   if (png_ptr->transformed_pixel_depth == 0)
1700+   {
1701+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1702+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1703+         png_error(png_ptr, "sequential row overflow");
1704+   }
1705+
1706+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1707+      png_error(png_ptr, "internal sequential row size calculation error");
1708+
1709+
1710+   if (rows[i] != NULL)
1711+      png_combine_row(png_ptr, rows[i], -1/*ignored*/);
1712+
1713+   png_read_finish_row(png_ptr);
1714+
1715+   if (png_ptr->read_row_fn != NULL)
1716+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1717+
1718+   png_ptr->row_buf = png_ptr->row_buf + row_info_origin.rowbytes + 1;
1719+
1720+   row_info = row_info_origin; /* second row: start from the untransformed layout */
1721+   memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_origin.rowbytes + 1);
1722+
1723+#ifdef PNG_MNG_FEATURES_SUPPORTED
1724+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1725+       (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1726+   {
1727+      /* Intrapixel differencing */
1728+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1729+   }
1730+#endif
1731+
1732+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1733+   if (png_ptr->transformations
1734+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1735+       || png_ptr->num_palette_max >= 0
1736+# endif
1737+      )
1738+      png_do_read_transformations(png_ptr, &row_info);
1739+#endif
1740+
1741+   /* The transformed pixel depth should match the depth now in row_info. */
1742+   if (png_ptr->transformed_pixel_depth == 0)
1743+   {
1744+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1745+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1746+         png_error(png_ptr, "sequential row overflow");
1747+   }
1748+
1749+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1750+      png_error(png_ptr, "internal sequential row size calculation error");
1751+
1752+
1753+   if (rows[i+1] != NULL)
1754+      png_combine_row(png_ptr, rows[i+1], -1/*ignored*/);
1755+
1756+   png_read_finish_row(png_ptr);
1757+
1758+   if (png_ptr->read_row_fn != NULL)
1759+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1760+
1761+   png_ptr->row_buf = png_ptr->row_buf + row_info_origin.rowbytes + 1;
1762+
1763+}
1764+/* Inflate row_num rows in one call, then defilter and deliver them in order. */
1765+void png_read_muilty_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 row_num,
1766+                          png_row_info row_info_origin)
1767+{
1768+   if (png_ptr == NULL)
1769+      return;
1770+
1771+   png_debug2(1, "in png_read_muilty_rows (row %lu, pass %d)",
1772+       (unsigned long)png_ptr->row_number, png_ptr->pass);
1773+
1774+   if ((png_ptr->mode & PNG_HAVE_IDAT) == 0)
1775+      png_error(png_ptr, "Invalid attempt to read row data");
1776+
1777+   /* Fill the row with IDAT data: */
1778+   uInt row_bytes = row_info_origin.rowbytes;
1779+   png_ptr->row_buf[0]=255; /* to force error if no data was found */
1780+   png_read_IDAT_data(png_ptr, png_ptr->row_buf, (row_bytes + 1) * row_num);
1781+   png_bytep temp_row = png_ptr->row_buf;
1782+
1783+   for (png_uint_32 i = 0; i < row_num; i++) {
1784+      png_row_info row_info = row_info_origin;
1785+      if ((row_info_origin.channels == 3 || row_info_origin.channels == 4) &&
1786+          i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
1787+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1788+          png_ptr->row_buf[0] == png_ptr->row_buf[row_info_origin.rowbytes + 1]
1789+          ) {
1790+         png_read_two_rows(png_ptr, rows, i, row_info);
1791+         i++;
1792+         continue;
1793+      }
1794+      if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE)
1795+      {
1796+         if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST)
1797+            png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1798+                png_ptr->prev_row + 1, png_ptr->row_buf[0]);
1799+         else
1800+            png_error(png_ptr, "bad adaptive filter value");
1801+      }
1802+
1803+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_origin.rowbytes + 1);
1804+
1805+#ifdef PNG_MNG_FEATURES_SUPPORTED
1806+      if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1807+          (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1808+      {
1809+         /* Intrapixel differencing */
1810+         png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1811+      }
1812+#endif
1813+
1814+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1815+      if (png_ptr->transformations
1816+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1817+          || png_ptr->num_palette_max >= 0
1818+# endif
1819+         )
1820+         png_do_read_transformations(png_ptr, &row_info);
1821+#endif
1822+
1823+      /* The transformed pixel depth should match the depth now in row_info. */
1824+      if (png_ptr->transformed_pixel_depth == 0)
1825+      {
1826+         png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1827+         if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1828+            png_error(png_ptr, "sequential row overflow");
1829+      }
1830+
1831+      else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1832+         png_error(png_ptr, "internal sequential row size calculation error");
1833+
1834+
1835+      if (rows[i] != NULL)
1836+         png_combine_row(png_ptr, rows[i], -1/*ignored*/);
1837+
1838+      png_read_finish_row(png_ptr);
1839+
1840+      if (png_ptr->read_row_fn != NULL)
1841+         (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1842+
1843+      png_ptr->row_buf = png_ptr->row_buf + row_bytes + 1;
1844+   }
1845+   png_ptr->row_buf = temp_row;
1846+}
1847+#endif
1848+
1849 /* Read the entire image.
If the image has an alpha channel or a tRNS 1850 * chunk, and you have called png_handle_alpha()[*], you will need to 1851 * initialize the image to the current image that PNG will be overlaying. 1852@@ -745,13 +927,82 @@ png_read_image(png_structrp png_ptr, png_bytepp image) 1853 1854 image_height=png_ptr->height; 1855 1856- for (j = 0; j < pass; j++) 1857- { 1858+#ifdef PNG_MULTY_LINE_ENABLE 1859+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1860+ (png_ptr->transformations & PNG_CHECK) == 0) { 1861+ if ((png_ptr->flags & PNG_FLAG_ROW_INIT) == 0) 1862+ png_read_start_row(png_ptr); 1863+ 1864+#ifdef PNG_WARNINGS_SUPPORTED 1865+ /* Check for transforms that have been set but were defined out */ 1866+#if defined(PNG_WRITE_INVERT_SUPPORTED) && !defined(PNG_READ_INVERT_SUPPORTED) 1867+ if ((png_ptr->transformations & PNG_INVERT_MONO) != 0) 1868+ png_warning(png_ptr, "PNG_READ_INVERT_SUPPORTED is not defined"); 1869+#endif 1870+ 1871+#if defined(PNG_WRITE_FILLER_SUPPORTED) && !defined(PNG_READ_FILLER_SUPPORTED) 1872+ if ((png_ptr->transformations & PNG_FILLER) != 0) 1873+ png_warning(png_ptr, "PNG_READ_FILLER_SUPPORTED is not defined"); 1874+#endif 1875+ 1876+#if defined(PNG_WRITE_PACKSWAP_SUPPORTED) && \ 1877+ !defined(PNG_READ_PACKSWAP_SUPPORTED) 1878+ if ((png_ptr->transformations & PNG_PACKSWAP) != 0) 1879+ png_warning(png_ptr, "PNG_READ_PACKSWAP_SUPPORTED is not defined"); 1880+#endif 1881+ 1882+#if defined(PNG_WRITE_PACK_SUPPORTED) && !defined(PNG_READ_PACK_SUPPORTED) 1883+ if ((png_ptr->transformations & PNG_PACK) != 0) 1884+ png_warning(png_ptr, "PNG_READ_PACK_SUPPORTED is not defined"); 1885+#endif 1886+ 1887+#if defined(PNG_WRITE_SHIFT_SUPPORTED) && !defined(PNG_READ_SHIFT_SUPPORTED) 1888+ if ((png_ptr->transformations & PNG_SHIFT) != 0) 1889+ png_warning(png_ptr, "PNG_READ_SHIFT_SUPPORTED is not defined"); 1890+#endif 1891+ 1892+#if defined(PNG_WRITE_BGR_SUPPORTED) && !defined(PNG_READ_BGR_SUPPORTED) 1893+ if ((png_ptr->transformations & 
PNG_BGR) != 0) 1894+ png_warning(png_ptr, "PNG_READ_BGR_SUPPORTED is not defined"); 1895+#endif 1896+ 1897+#if defined(PNG_WRITE_SWAP_SUPPORTED) && !defined(PNG_READ_SWAP_SUPPORTED) 1898+ if ((png_ptr->transformations & PNG_SWAP_BYTES) != 0) 1899+ png_warning(png_ptr, "PNG_READ_SWAP_SUPPORTED is not defined"); 1900+#endif 1901+#endif /* WARNINGS */ 1902+ 1903+ png_row_info row_info; 1904+ row_info.width = png_ptr->iwidth; 1905+ row_info.color_type = png_ptr->color_type; 1906+ row_info.bit_depth = png_ptr->bit_depth; 1907+ row_info.channels = png_ptr->channels; 1908+ row_info.pixel_depth = png_ptr->pixel_depth; 1909+ row_info.rowbytes = png_ptr->rowbytes; 1910+ 1911 rp = image; 1912- for (i = 0; i < image_height; i++) 1913+ int row_num = PNG_INFLATE_ROWS; 1914+ for (i = 0; i < image_height; i += PNG_INFLATE_ROWS) 1915 { 1916- png_read_row(png_ptr, *rp, NULL); 1917- rp++; 1918+ if (image_height - i < PNG_INFLATE_ROWS) 1919+ { 1920+ row_num = image_height - i; 1921+ } 1922+ png_read_muilty_rows(png_ptr, rp, row_num, row_info); 1923+ rp += row_num; 1924+ } 1925+ } 1926+ else 1927+#endif 1928+ { 1929+ for (j = 0; j < pass; j++) 1930+ { 1931+ rp = image; 1932+ for (i = 0; i < image_height; i++) 1933+ { 1934+ png_read_row(png_ptr, *rp, NULL); 1935+ rp++; 1936+ } 1937 } 1938 } 1939 } 1940@@ -1000,6 +1251,10 @@ png_read_destroy(png_structrp png_ptr) 1941 png_ptr->riffled_palette = NULL; 1942 #endif 1943 1944+#ifdef PNG_MULTY_LINE_ENABLE 1945+ png_free(png_ptr, png_ptr->inflate_buff); 1946+ png_ptr->inflate_buff = NULL; 1947+#endif 1948 /* NOTE: the 'setjmp' buffer may still be allocated and the memory and error 1949 * callbacks are still set at this point. They are required to complete the 1950 * destruction of the png_struct itself. 
1951diff --git a/pngrutil.c b/pngrutil.c 1952index ca060dd..c3c177c 100644 1953--- a/pngrutil.c 1954+++ b/pngrutil.c 1955@@ -4136,7 +4136,7 @@ png_read_filter_row(png_structrp pp, png_row_infop row_info, png_bytep row, 1956 * PNG_FILTER_OPTIMIZATIONS to a function that overrides the generic 1957 * implementations. See png_init_filter_functions above. 1958 */ 1959- if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST) 1960+ if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST_X2) 1961 { 1962 if (pp->read_filter[0] == NULL) 1963 png_init_filter_functions(pp); 1964@@ -4604,11 +4604,24 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED) 1965 png_free(png_ptr, png_ptr->big_prev_row); 1966 1967 if (png_ptr->interlaced != 0) 1968- png_ptr->big_row_buf = (png_bytep)png_calloc(png_ptr, 1969- row_bytes + 48); 1970+ { 1971+ png_ptr->big_row_buf = (png_bytep)png_calloc(png_ptr, row_bytes + 48); 1972+ } 1973 1974 else 1975- png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes + 48); 1976+ { 1977+ png_uint_32 row_num = 1; 1978+#ifdef PNG_MULTY_LINE_ENABLE 1979+ if (png_ptr->bit_depth == 8 && 1980+ (png_ptr->transformations & PNG_CHECK) == 0) 1981+ { 1982+ row_num = png_ptr->height < PNG_INFLATE_ROWS ? 
1983+ png_ptr->height : PNG_INFLATE_ROWS; 1984+ } 1985+#endif 1986+ png_ptr->big_row_buf = 1987+ (png_bytep)png_malloc(png_ptr, row_bytes * row_num + 48); 1988+ } 1989 1990 png_ptr->big_prev_row = (png_bytep)png_malloc(png_ptr, row_bytes + 48); 1991 1992diff --git a/pngstruct.h b/pngstruct.h 1993index e591d94..1875c7a 100644 1994--- a/pngstruct.h 1995+++ b/pngstruct.h 1996@@ -140,6 +140,16 @@ typedef const png_colorspace * PNG_RESTRICT png_const_colorspacerp; 1997 #define PNG_COLORSPACE_CANCEL(flags) (0xffff ^ (flags)) 1998 #endif /* COLORSPACE || GAMMA */ 1999 2000+#ifdef PNG_MULTY_LINE_ENABLE 2001+/* General flags for the 2 line filter */ 2002+#define PNG_FILTER_VALUE_UP_X2 6 // PNG_FILTER_VALUE_UP + 4 2003+#define PNG_FILTER_VALUE_AVG_X2 7 // PNG_FILTER_VALUE_AVG + 4 2004+#define PNG_FILTER_VALUE_PAETH_X2 8 // PNG_FILTER_VALUE_PAETH + 4 2005+#define PNG_FILTER_VALUE_LAST_X2 9 // PNG_FILTER_VALUE_LAST + 4 2006+#else 2007+#define PNG_FILTER_VALUE_LAST_X2 5 // PNG_FILTER_VALUE_LAST 2008+#endif 2009+ 2010 struct png_struct_def 2011 { 2012 #ifdef PNG_SETJMP_SUPPORTED 2013@@ -467,7 +477,7 @@ struct png_struct_def 2014 png_bytep big_prev_row; 2015 2016 /* New member added in libpng-1.5.7 */ 2017- void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info, 2018+ void (*read_filter[PNG_FILTER_VALUE_LAST_X2-1])(png_row_infop row_info, 2019 png_bytep row, png_const_bytep prev_row); 2020 2021 #ifdef PNG_READ_SUPPORTED 2022@@ -475,5 +485,11 @@ struct png_struct_def 2023 png_colorspace colorspace; 2024 #endif 2025 #endif 2026+ 2027+#ifdef PNG_MULTY_LINE_ENABLE 2028+ png_bytep inflate_buff; 2029+ png_uint_32 inflate_buff_max_size; 2030+ png_uint_32 inflate_buff_size; 2031+#endif 2032 }; 2033 #endif /* PNGSTRUCT_H */ 2034diff --git a/pngtrans.c b/pngtrans.c 2035index 1100f46..4860e20 100644 2036--- a/pngtrans.c 2037+++ b/pngtrans.c 2038@@ -13,6 +13,17 @@ 2039 2040 #include "pngpriv.h" 2041 2042+#ifdef PNG_ARM_NEON_IMPLEMENTATION 2043+# if PNG_ARM_NEON_IMPLEMENTATION == 
1 2044+# define PNG_ARM_NEON_INTRINSICS_AVAILABLE 2045+# if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64) 2046+# include <arm64_neon.h> 2047+# else 2048+# include <arm_neon.h> 2049+# endif 2050+# endif 2051+#endif 2052+ 2053 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED) 2054 2055 #if defined(PNG_READ_BGR_SUPPORTED) || defined(PNG_WRITE_BGR_SUPPORTED) 2056@@ -269,13 +280,19 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2057 if (row_info->color_type == PNG_COLOR_TYPE_GRAY) 2058 { 2059 png_bytep rp = row; 2060- size_t i; 2061- size_t istop = row_info->rowbytes; 2062- 2063- for (i = 0; i < istop; i++) 2064+ png_bytep rp_stop = row + row_info->rowbytes; 2065+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2066+ png_bytep rp_stop_neon = row + row_info->rowbytes - 16; 2067+ for (; rp < rp_stop_neon; rp += 16) 2068+ { 2069+ uint8x16_t gray = vld1q_u8(rp); 2070+ gray = ~gray; 2071+ vst1q_u8(rp, gray); 2072+ } 2073+#endif 2074+ for (; rp < rp_stop; rp++) 2075 { 2076 *rp = (png_byte)(~(*rp)); 2077- rp++; 2078 } 2079 } 2080 2081@@ -283,10 +300,17 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2082 row_info->bit_depth == 8) 2083 { 2084 png_bytep rp = row; 2085- size_t i; 2086- size_t istop = row_info->rowbytes; 2087- 2088- for (i = 0; i < istop; i += 2) 2089+ png_bytep rp_stop = row + row_info->rowbytes; 2090+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2091+ png_bytep rp_stop_neon = row + row_info->rowbytes - 32; 2092+ for (; rp < rp_stop_neon; rp += 32) 2093+ { 2094+ uint8x16x2_t gray_alpha = vld2q_u8(rp); 2095+ gray_alpha.val[0] = ~gray_alpha.val[0]; 2096+ vst2q_u8(rp, gray_alpha); 2097+ } 2098+#endif 2099+ while (rp < rp_stop) /* no step here: the body below already advances rp by 2 */ 2100 { 2101 *rp = (png_byte)(~(*rp)); 2102 rp += 2; 2103@@ -298,10 +322,18 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2104 row_info->bit_depth == 16) 2105 { 2106 png_bytep rp = row; 2107- size_t i; 2108- size_t istop = row_info->rowbytes; 2109- 2110- for (i = 0; i < istop; i += 4) 2111+
png_bytep rp_stop = row + row_info->rowbytes; 2112+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2113+ png_bytep rp_stop_neon = row + row_info->rowbytes - 64; 2114+ for (; rp < rp_stop_neon; rp += 64) 2115+ { 2116+ uint8x16x4_t gray_alpha = vld4q_u8(rp); 2117+ gray_alpha.val[0] = ~gray_alpha.val[0]; 2118+ gray_alpha.val[1] = ~gray_alpha.val[1]; 2119+ vst4q_u8(rp, gray_alpha); 2120+ } 2121+#endif 2122+ while (rp < rp_stop) /* no step here: rp is advanced by 4 inside the unchanged loop body */ 2123 { 2124 *rp = (png_byte)(~(*rp)); 2125 *(rp + 1) = (png_byte)(~(*(rp + 1))); 2126@@ -323,10 +355,20 @@ png_do_swap(png_row_infop row_info, png_bytep row) 2127 if (row_info->bit_depth == 16) 2128 { 2129 png_bytep rp = row; 2130- png_uint_32 i; 2131- png_uint_32 istop= row_info->width * row_info->channels; 2132+ png_bytep rp_stop = row + row_info->rowbytes; 2133+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2134+ png_bytep rp_stop_neon = row + row_info->rowbytes - 32; 2135+ for (; rp < rp_stop_neon; rp += 32) 2136+ { 2137+ uint8x16x2_t gray = vld2q_u8(rp); 2138+ uint8x16_t tmp = gray.val[0]; 2139+ gray.val[0] = gray.val[1]; 2140+ gray.val[1] = tmp; 2141+ vst2q_u8(rp, gray); 2142+ } 2143+#endif 2144 2145- for (i = 0; i < istop; i++, rp += 2) 2146+ for (; rp < rp_stop; rp += 2) 2147 { 2148 #ifdef PNG_BUILTIN_BSWAP16_SUPPORTED 2149 /* Feature added to libpng-1.6.11 for testing purposes, not 2150@@ -622,15 +664,25 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2151 2152 if ((row_info->color_type & PNG_COLOR_MASK_COLOR) != 0) 2153 { 2154- png_uint_32 row_width = row_info->width; 2155 if (row_info->bit_depth == 8) 2156 { 2157 if (row_info->color_type == PNG_COLOR_TYPE_RGB) 2158 { 2159- png_bytep rp; 2160- png_uint_32 i; 2161+ png_bytep rp = row; 2162+ png_bytep rp_stop = row + row_info->rowbytes; 2163 2164- for (i = 0, rp = row; i < row_width; i++, rp += 3) 2165+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2166+ png_bytep rp_stop_neon = row + row_info->rowbytes - 48; 2167+ for (; rp < rp_stop_neon; rp += 48) 2168+ { 2169+ uint8x16x3_t bgr =
vld3q_u8(rp); 2170+ uint8x16_t tmp = bgr.val[2]; 2171+ bgr.val[2] = bgr.val[0]; 2172+ bgr.val[0] = tmp; 2173+ vst3q_u8(rp, bgr); 2174+ } 2175+#endif 2176+ for (; rp < rp_stop; rp += 3) 2177 { 2178 png_byte save = *rp; 2179 *rp = *(rp + 2); 2180@@ -640,10 +692,21 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2181 2182 else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA) 2183 { 2184- png_bytep rp; 2185- png_uint_32 i; 2186+ png_bytep rp = row; 2187+ png_bytep rp_stop = row + row_info->rowbytes; 2188 2189- for (i = 0, rp = row; i < row_width; i++, rp += 4) 2190+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2191+ png_bytep rp_stop_neon = row + row_info->rowbytes - 64; 2192+ for (; rp < rp_stop_neon; rp += 64) 2193+ { 2194+ uint8x16x4_t bgra = vld4q_u8(rp); 2195+ uint8x16_t tmp = bgra.val[2]; 2196+ bgra.val[2] = bgra.val[0]; 2197+ bgra.val[0] = tmp; 2198+ vst4q_u8(rp, bgra); 2199+ } 2200+#endif 2201+ for (; rp < rp_stop; rp += 4) 2202 { 2203 png_byte save = *rp; 2204 *rp = *(rp + 2); 2205@@ -657,10 +720,21 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2206 { 2207 if (row_info->color_type == PNG_COLOR_TYPE_RGB) 2208 { 2209- png_bytep rp; 2210- png_uint_32 i; 2211+ png_bytep rp = row; 2212+ png_bytep rp_stop = row + row_info->rowbytes; 2213 2214- for (i = 0, rp = row; i < row_width; i++, rp += 6) 2215+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2216+ png_bytep rp_stop_neon = row + row_info->rowbytes - 48; 2217+ for (; rp < rp_stop_neon; rp += 48) 2218+ { 2219+ uint16x8x3_t bgr = vld3q_u16((unsigned short *)rp); 2220+ uint16x8_t tmp = bgr.val[2]; 2221+ bgr.val[2] = bgr.val[0]; 2222+ bgr.val[0] = tmp; 2223+ vst3q_u16((unsigned short *)rp, bgr); 2224+ } 2225+#endif 2226+ for (; rp < rp_stop; rp += 6) 2227 { 2228 png_byte save = *rp; 2229 *rp = *(rp + 4); 2230@@ -673,10 +747,21 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2231 2232 else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA) 2233 { 2234- png_bytep rp; 2235- png_uint_32 i; 2236+ png_bytep rp 
= row; 2237+ png_bytep rp_stop = row + row_info->rowbytes; 2238 2239- for (i = 0, rp = row; i < row_width; i++, rp += 8) 2240+#ifdef PNG_ARM_NEON_INTRINSICS_AVAILABLE 2241+ png_bytep rp_stop_neon = row + row_info->rowbytes - 64; /* vld4q_u16/vst4q_u16 touch 64 bytes per pass, so stop while more than 64 remain */ 2242+ for (; rp < rp_stop_neon; rp += 64) 2243+ { 2244+ uint16x8x4_t bgra = vld4q_u16((unsigned short *)rp); 2245+ uint16x8_t tmp = bgra.val[2]; 2246+ bgra.val[2] = bgra.val[0]; 2247+ bgra.val[0] = tmp; 2248+ vst4q_u16((unsigned short *)rp, bgra); 2249+ } 2250+#endif 2251+ for (; rp < rp_stop; rp += 8) 2252 { 2253 png_byte save = *rp; 2254 *rp = *(rp + 4); 2255