1diff --git a/arm/arm_init.c b/arm/arm_init.c 2index 3a89998ab..05aa2c0d9 100644 3--- a/arm/arm_init.c 4+++ b/arm/arm_init.c 5@@ -113,13 +113,23 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp) 6 * initialization function.) 7 */ 8 pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon; 9- 10+#ifdef PNG_MULTY_LINE_ENABLE 11+ // OH ISSUE: png optimize 12+ pp->read_filter[PNG_FILTER_VALUE_UP_X2-1] = png_read_filter_row_up_x2_neon; 13+#endif 14 if (bpp == 3) 15 { 16 pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon; 17 pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon; 18 pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = 19 png_read_filter_row_paeth3_neon; 20+#ifdef PNG_MULTY_LINE_ENABLE 21+ // OH ISSUE: png optimize 22+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] = 23+ png_read_filter_row_avg3_x2_neon; 24+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] = 25+ png_read_filter_row_paeth3_x2_neon; 26+#endif 27 } 28 29 else if (bpp == 4) 30@@ -128,6 +138,13 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp) 31 pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon; 32 pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = 33 png_read_filter_row_paeth4_neon; 34+#ifdef PNG_MULTY_LINE_ENABLE 35+ // OH ISSUE: png optimize 36+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] = 37+ png_read_filter_row_avg4_x2_neon; 38+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] = 39+ png_read_filter_row_paeth4_x2_neon; 40+#endif 41 } 42 } 43 #endif /* PNG_ARM_NEON_OPT > 0 */ 44diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c 45index 4466d48b2..27048a578 100644 46--- a/arm/filter_neon_intrinsics.c 47+++ b/arm/filter_neon_intrinsics.c 48@@ -47,6 +47,7 @@ 49 50 #if PNG_ARM_NEON_OPT > 0 51 52+#ifndef PNG_MULTY_LINE_ENABLE 53 void 54 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, 55 png_const_bytep prev_row) 56@@ -396,7 +397,1351 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 57 vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0); 58 } 59 } 60+#else 61+// OH ISSUE: png optimize 62+// according to definition: row_info->rowbytes = row_width * row_info->channels, 63+// the input rowbytes must be 3 or 4 times the channel size, so: 64+// for RGB neon process 12 bytes at once,the tail must be 3,6,9; 65+// for RGBA neon process 16 or 8 bytes at once,the tail must be 4; 66+// filter operators are internal function, row_info and row ensure non empty outside. 67+#define STEP_RGB (12) // 3 channel RGB process 12 bytes at once 68+#define TAIL_RGB3 (9) // tail 3 pixels have 9 bytes 69+#define TAIL_RGB2 (6) // tail 2 pixels have 6 bytes 70+#define TAIL_RGB1 (3) // tail 1 pixel have 3 bytes 71+#define STEP_RGBA (16) // GBA neon process 16 bytes at once 72+#define STEP_RGBA_HALF (8) // GBA neon process 8 bytes at once 73+#define TAIL_RGBA (4) // tail 1 pixel have 4 bytes 74+#define IND3 (3) // index 3 75+#define IND2 (2) // index 2 76+#define OFFSET3 (3) // RGB offset 3 bytes 77+#define OFFSET6 (6) // RGB offset 6 bytes 78+void png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, 79+ png_const_bytep prev_row) 80+{ 81+ png_bytep rp = row; 82+ png_const_bytep pp = prev_row; 83+ int count = row_info->rowbytes; 84+ 85+ png_debug(1, "in png_read_filter_row_up_neon"); 86+ 87+ uint8x16_t qrp, qpp; 88+ while (count >= STEP_RGBA) { 89+ qrp = vld1q_u8(rp); 90+ qpp = vld1q_u8(pp); 91+ qrp = vaddq_u8(qrp, qpp); 92+ vst1q_u8(rp, qrp); 93+ rp += STEP_RGBA; 94+ pp += STEP_RGBA; 95+ count -= STEP_RGBA; 96+ } 97+ 98+ if (count >= STEP_RGBA_HALF) { 99+ uint8x8_t qrp1, qpp1; 100+ qrp1 = vld1_u8(rp); 101+ qpp1 = vld1_u8(pp); 102+ qrp1 = vadd_u8(qrp1, qpp1); 103+ vst1_u8(rp, qrp1); 104+ rp += STEP_RGBA_HALF; 105+ pp += STEP_RGBA_HALF; 106+ count -= STEP_RGBA_HALF; 107+ } 108+ 109+ for (int i = 0; i < count; i++) { 110+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 111+ rp++; 112+ } 113+} 114+ 115+void png_read_filter_row_up_x2_neon(png_row_infop row_info, png_bytep row, 116+ png_const_bytep prev_row) 117+{ 118+ png_bytep rp = row; 119+ png_const_bytep pp = prev_row; 120+ int count = row_info->rowbytes; 121+ png_bytep np = row + row_info->rowbytes + 1; 122+ 123+ png_debug(1, "in png_read_filter_row_up_x2_neon"); 124+ 125+ uint8x16_t qrp, qpp, qnp; 126+ while (count >= STEP_RGBA) { 127+ qrp = vld1q_u8(rp); 128+ qpp = vld1q_u8(pp); 129+ qnp = vld1q_u8(np); 130+ qrp = vaddq_u8(qrp, qpp); 131+ qnp = vaddq_u8(qnp, qrp); 132+ vst1q_u8(rp, qrp); 133+ vst1q_u8(np, qnp); 134+ rp += STEP_RGBA; 135+ pp += STEP_RGBA; 136+ np += STEP_RGBA; 137+ count -= STEP_RGBA; 138+ } 139+ 140+ if (count >= STEP_RGBA_HALF) { 141+ uint8x8_t qrp1, qpp1, qnp1; 142+ qrp1 = vld1_u8(rp); 143+ qpp1 = vld1_u8(pp); 144+ qnp1 = vld1_u8(np); 145+ qrp1 = vadd_u8(qrp1, qpp1); 146+ qnp1 = vadd_u8(qnp1, qrp1); 147+ vst1_u8(rp, qrp1); 148+ vst1_u8(np, qnp1); 149+ rp += STEP_RGBA_HALF; 150+ pp += STEP_RGBA_HALF; 151+ np += STEP_RGBA_HALF; 152+ count -= STEP_RGBA_HALF; 153+ } 154+ 155+ for (int i = 0; i < count; i++) { 156+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 157+ *np = (png_byte)(((int)(*np) + (int)(*rp++)) & 0xff); 158+ np++; 159+ } 160+} 161+ 162+void png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, 163+ png_const_bytep prev_row) 164+{ 165+ png_bytep rp = row; 166+ png_bytep rp_stop = row + row_info->rowbytes; 167+ 168+ uint8x16_t vtmp = vld1q_u8(rp); 169+ uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp); 170+ uint8x8x2_t vrp = *vrpt; 171+ 172+ uint8x8x4_t vdest; 173+ vdest.val[IND3] = vdup_n_u8(0); 174+ 175+ uint8x8_t vtmp1, vtmp2; 176+ uint32x2_t *temp_pointer; 177+ 178+ png_debug(1, "in png_read_filter_row_sub3_neon"); 179+ 180+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 181+ png_byte last_byte = *rp_stop; 182+ png_bytep rp_stop_new = rp_stop - tail_bytes; 183+ for (; rp < rp_stop_new;) 184+ { 185+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 186+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 187+ vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 188+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 189+ 190+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 191+ vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2); 192+ vdest.val[IND3] = vadd_u8(vdest.val[IND2], vtmp1); 193+ 194+ vtmp = vld1q_u8(rp + STEP_RGB); 195+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 196+ vrp = *vrpt; 197+ 198+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 199+ rp += OFFSET3; 200+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 201+ rp += OFFSET3; 202+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 203+ rp += OFFSET3; 204+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 205+ rp += OFFSET3; 206+ } 207+ 208+ if (tail_bytes == TAIL_RGB1) { 209+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 210+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 211+ } else if (tail_bytes == TAIL_RGB2) { 212+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 213+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 214+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 215+ 216+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 217+ rp += OFFSET3; 218+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 219+ } else if (tail_bytes == TAIL_RGB3) { 220+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 221+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 222+ vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 223+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 224+ vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2); 225+ 226+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 227+ rp += OFFSET3; 228+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 229+ rp += OFFSET3; 230+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 231+ } 232+ *rp_stop = last_byte; 233+ 234+ PNG_UNUSED(prev_row) 235+} 236+ 237+void png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row, 238+ png_const_bytep prev_row) 239+{ 240+ png_bytep rp = row; 241+ int count = row_info->rowbytes; 242+ 243+ uint8x8x4_t vdest; 244+ vdest.val[IND3] = vdup_n_u8(0); 245+ 246+ png_debug(1, "in png_read_filter_row_sub4_neon"); 247+ 248+ uint32x2x4_t vtmp; 249+ uint8x8x4_t *vrpt; 250+ uint8x8x4_t vrp; 251+ uint32x2x4_t vdest_val; 252+ while (count >= STEP_RGBA) { 253+ uint32x2x4_t *temp_pointer; 254+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 255+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 256+ vrp = *vrpt; 257+ 258+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 259+ vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]); 260+ vdest.val[IND2] = vadd_u8(vdest.val[1], vrp.val[IND2]); 261+ vdest.val[IND3] = vadd_u8(vdest.val[IND2], vrp.val[IND3]); 262+ 263+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 264+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 265+ 266+ rp += STEP_RGBA; 267+ count -= STEP_RGBA; 268+ } 269+ 270+ if (count >= STEP_RGBA_HALF) { 271+ uint32x2x2_t vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 272+ uint8x8x2_t *vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 273+ uint8x8x2_t vrp1 = *vrpt1; 274+ uint32x2x2_t *temp_pointer; 275+ uint32x2x2_t vdest_val1; 276+ 277+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp1.val[0]); 278+ vdest.val[1] = vadd_u8(vdest.val[0], vrp1.val[1]); 279+ vdest.val[IND3] = vdest.val[1]; 280+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 281+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 282+ 283+ rp += STEP_RGBA_HALF; 284+ count -= STEP_RGBA_HALF; 285+ } 286+ 287+ if (count == 0) { 288+ return; 289+ } 290+ 291+ uint32x2_t vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 292+ uint8x8_t *vrpt2 = png_ptr(uint8x8_t, &vtmp2); 293+ uint8x8_t vrp2 = *vrpt2; 294+ uint32x2_t *temp_pointer; 295+ uint32x2_t vdest_val2; 296+ 297+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp2); 298+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 299+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 300+ 301+ PNG_UNUSED(prev_row) 302+} 303+ 304+void png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row, 305+ png_const_bytep prev_row) 306+{ 307+ png_bytep rp = row; 308+ png_const_bytep pp = prev_row; 309+ png_bytep rp_stop = row + row_info->rowbytes; 310+ 311+ uint8x16_t vtmp; 312+ uint8x8x2_t *vrpt; 313+ uint8x8x2_t vrp; 314+ uint8x8x4_t vdest; 315+ vdest.val[IND3] = vdup_n_u8(0); 316+ 317+ vtmp = vld1q_u8(rp); 318+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 319+ vrp = *vrpt; 320+ 321+ png_debug(1, "in png_read_filter_row_avg3_neon"); 322+ 323+ uint8x8_t vtmp1, vtmp2, vtmp3; 324+ uint8x8x2_t *vppt; 325+ uint8x8x2_t vpp; 326+ uint32x2_t *temp_pointer; 327+ 328+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 329+ png_byte last_byte = *rp_stop; 330+ png_bytep rp_stop_new = rp_stop - tail_bytes; 331+ for (; rp < rp_stop_new; pp += STEP_RGB) 332+ { 333+ vtmp = vld1q_u8(pp); 334+ vppt = png_ptr(uint8x8x2_t, &vtmp); 335+ vpp = *vppt; 336+ 337+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 338+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 339+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 340+ 341+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 342+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 343+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 344+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 345+ 346+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 347+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 348+ 349+ vtmp = vld1q_u8(rp + STEP_RGB); 350+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 351+ vrp = *vrpt; 352+ 353+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 354+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 355+ 356+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 357+ 358+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2); 359+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 360+ 361+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 362+ rp += OFFSET3; 363+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 364+ rp += OFFSET3; 365+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 366+ rp += OFFSET3; 367+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 368+ rp += OFFSET3; 369+ } 370+ 371+ vtmp = vld1q_u8(pp); 372+ vppt = png_ptr(uint8x8x2_t, &vtmp); 373+ vpp = *vppt; 374+ 375+ if (tail_bytes == TAIL_RGB1) { 376+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 377+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 378+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 379+ } else if (tail_bytes == TAIL_RGB2) { 380+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 381+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 382+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 383+ 384+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 385+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 386+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 387+ 388+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 389+ rp += OFFSET3; 390+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 391+ } else if (tail_bytes == TAIL_RGB3) { 392+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 393+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 394+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 395+ 396+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 397+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 398+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 399+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 400+ 401+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 402+ 403+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 404+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 405+ 406+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 407+ rp += OFFSET3; 408+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 409+ rp += OFFSET3; 410+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 411+ } 412+ *rp_stop = last_byte; 413+} 414+ 415+void png_read_filter_row_avg3_x2_neon(png_row_infop row_info, png_bytep row, 416+ png_const_bytep prev_row) 417+{ 418+ png_bytep rp = row; 419+ png_const_bytep pp = prev_row; 420+ png_bytep rp_stop = row + row_info->rowbytes; 421+ png_bytep np = rp_stop + 1; 422+ 423+ uint8x16_t vtmp; 424+ uint8x8x2_t *vrpt; 425+ uint8x8x2_t vrp; 426+ uint8x8x4_t vdest; 427+ vdest.val[IND3] = vdup_n_u8(0); 428+ 429+ vtmp = vld1q_u8(rp); 430+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 431+ vrp = *vrpt; 432+ 433+ uint8x8x2_t *vnpt; 434+ uint8x8x2_t vnp; 435+ uint8x8x4_t vdestN; 436+ vdestN.val[IND3] = vdup_n_u8(0); 437+ 438+ vtmp = vld1q_u8(np); 439+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 440+ vnp = *vnpt; 441+ 442+ png_debug(1, "in png_read_filter_row_x2_avg3_neon"); 443+ 444+ uint8x8_t vtmp1, vtmp2, vtmp3; 445+ uint8x8x2_t *vppt; 446+ uint8x8x2_t vpp; 447+ uint32x2_t *temp_pointer; 448+ 449+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 450+ png_byte last_byte = *rp_stop; 451+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); 452+ png_bytep rp_stop_new = rp_stop - tail_bytes; 453+ for (; rp < rp_stop_new; pp += STEP_RGB) 454+ { 455+ vtmp = vld1q_u8(pp); 456+ vppt = png_ptr(uint8x8x2_t, &vtmp); 457+ vpp = *vppt; 458+ 459+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 460+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 461+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 462+ 463+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 464+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 465+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 466+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 467+ 468+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 469+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 470+ 471+ vtmp = vld1q_u8(rp + STEP_RGB); 472+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 473+ vrp = *vrpt; 474+ 475+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 476+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 477+ 478+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 479+ 480+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2); 481+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 482+ 483+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 484+ rp += OFFSET3; 485+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 486+ rp += OFFSET3; 487+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 488+ rp += OFFSET3; 489+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 490+ rp += OFFSET3; 491+ 492+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 493+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 494+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 495+ 496+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 497+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 498+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 499+ 500+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1); 501+ 502+ vtmp = vld1q_u8(np + STEP_RGB); 503+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 504+ vnp = *vnpt; 505+ 506+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]); 507+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3); 508+ 509+ vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]); 510+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1); 511+ 512+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 513+ np += OFFSET3; 514+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 515+ np += OFFSET3; 516+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 517+ np += OFFSET3; 518+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0); 519+ np += OFFSET3; 520+ } 521+ 522+ vtmp = vld1q_u8(pp); 523+ vppt = png_ptr(uint8x8x2_t, &vtmp); 524+ vpp = *vppt; 525+ 526+ if (tail_bytes == TAIL_RGB1) { 527+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 528+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 529+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 530+ 531+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 532+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 533+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 534+ } else if (tail_bytes == TAIL_RGB2) { 535+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 536+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 537+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 538+ 539+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 540+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 541+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 542+ 543+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 544+ rp += OFFSET3; 545+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 546+ 547+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 548+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 549+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 550+ 551+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 552+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 553+ 554+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 555+ np += OFFSET3; 556+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 557+ } else if (tail_bytes == TAIL_RGB3) { 558+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 559+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 560+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 561+ 562+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 563+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 564+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 565+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 566+ 567+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 568+ 569+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 570+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 571+ 572+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 573+ rp += OFFSET3; 574+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 575+ rp += OFFSET3; 576+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 577+ 578+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 579+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 580+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 581+ 582+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 583+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 584+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 585+ 586+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]); 587+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3); 588+ 589+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 590+ np += OFFSET3; 591+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 592+ np += OFFSET3; 593+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 594+ } 595+ *rp_stop = last_byte; 596+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; 597+} 598+ 599+void png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row, 600+ png_const_bytep prev_row) 601+{ 602+ png_bytep rp = row; 603+ png_const_bytep pp = prev_row; 604+ int count = row_info->rowbytes; 605+ 606+ uint8x8x4_t vdest; 607+ vdest.val[IND3] = vdup_n_u8(0); 608+ 609+ png_debug(1, "in png_read_filter_row_avg4_neon"); 610+ 611+ uint32x2x4_t vtmp; 612+ uint8x8x4_t *vrpt, *vppt; 613+ uint8x8x4_t vrp, vpp; 614+ uint32x2x4_t vdest_val; 615+ while (count >= STEP_RGBA) { 616+ uint32x2x4_t *temp_pointer; 617+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 618+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 619+ vrp = *vrpt; 620+ vtmp = vld4_u32(png_ptrc(uint32_t, pp)); 621+ vppt = png_ptr(uint8x8x4_t, &vtmp); 622+ vpp = *vppt; 623+ 624+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 625+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 626+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]); 627+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 628+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]); 629+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 630+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]); 631+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 632+ 633+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 634+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 635+ 636+ rp += STEP_RGBA; 637+ pp += STEP_RGBA; 638+ count -= STEP_RGBA; 639+ } 640+ 641+ if (count >= STEP_RGBA_HALF) { 642+ uint32x2x2_t vtmp1; 643+ uint8x8x2_t *vrpt1, *vppt1; 644+ uint8x8x2_t vrp1, vpp1; 645+ uint32x2x2_t *temp_pointer; 646+ uint32x2x2_t vdest_val1; 647+ 648+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 649+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 650+ vrp1 = *vrpt1; 651+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 652+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 653+ vpp1 = *vppt1; 654+ 655+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]); 656+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 657+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]); 658+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 659+ vdest.val[IND3] = vdest.val[1]; 660+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 661+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 662+ 663+ rp += STEP_RGBA_HALF; 664+ pp += STEP_RGBA_HALF; 665+ count -= STEP_RGBA_HALF; 666+ } 667+ 668+ if (count == 0) { 669+ return; 670+ } 671+ 672+ uint32x2_t vtmp2; 673+ uint8x8_t *vrpt2, *vppt2; 674+ uint8x8_t vrp2, vpp2; 675+ uint32x2_t *temp_pointer; 676+ uint32x2_t vdest_val2; 677+ 678+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 679+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 680+ vrp2 = *vrpt2; 681+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 682+ vppt2 = png_ptr(uint8x8_t, &vtmp2); 683+ vpp2 = *vppt2; 684+ 685+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2); 686+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 687+ 688+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 689+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 690+} 691 692+void png_read_filter_row_avg4_x2_neon(png_row_infop row_info, png_bytep row, 693+ png_const_bytep prev_row) 694+{ 695+ png_bytep rp = row; 696+ png_const_bytep pp = prev_row; 697+ int count = row_info->rowbytes; 698+ png_bytep np = row + count + 1; 699+ 700+ uint8x8x4_t vdest; 701+ vdest.val[IND3] = vdup_n_u8(0); 702+ 703+ png_debug(1, "in png_read_filter_row_avg4_x2_neon"); 704+ 705+ uint32x2x4_t vtmp; 706+ uint8x8x4_t *vrpt, *vppt; 707+ uint8x8x4_t vrp, vpp; 708+ uint32x2x4_t vdest_val; 709+ 710+ uint8x8x4_t *vnpt; 711+ uint8x8x4_t vnp; 712+ uint8x8x4_t vdestN; 713+ vdestN.val[IND3] = vdup_n_u8(0); 714+ 715+ while (count >= STEP_RGBA) { 716+ uint32x2x4_t *temp_pointer; 717+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 718+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 719+ vrp = *vrpt; 720+ vtmp = vld4_u32(png_ptrc(uint32_t, pp)); 721+ vppt = png_ptr(uint8x8x4_t, &vtmp); 722+ vpp = *vppt; 723+ vtmp = vld4_u32(png_ptrc(uint32_t, np)); 724+ vnpt = png_ptr(uint8x8x4_t, &vtmp); 725+ vnp = *vnpt; 726+ 727+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 728+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 729+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]); 730+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 731+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]); 732+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 733+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]); 734+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 735+ 736+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 737+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 738+ 739+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 740+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 741+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 742+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]); 743+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]); 744+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]); 745+ vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]); 746+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]); 747+ 748+ vdest_val = png_ldr(uint32x2x4_t, &vdestN); 749+ vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0); 750+ 751+ rp += STEP_RGBA; 752+ pp += STEP_RGBA; 753+ np += STEP_RGBA; 754+ count -= STEP_RGBA; 755+ } 756+ 757+ if (count >= STEP_RGBA_HALF) { 758+ uint32x2x2_t vtmp1; 759+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1; 760+ uint8x8x2_t vrp1, vpp1, vnp1; 761+ uint32x2x2_t *temp_pointer; 762+ uint32x2x2_t vdest_val1; 763+ 764+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 765+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 766+ vrp1 = *vrpt1; 767+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 768+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 769+ vpp1 = *vppt1; 770+ vtmp1 = vld2_u32(png_ptrc(uint32_t, np)); 771+ vnpt1 = png_ptr(uint8x8x2_t, &vtmp1); 772+ vnp1 = *vnpt1; 773+ 774+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]); 775+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 776+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]); 777+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 778+ vdest.val[IND3] = vdest.val[1]; 779+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 780+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 781+ 782+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 783+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]); 784+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 785+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]); 786+ vdestN.val[IND3] = vdestN.val[1]; 787+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN); 788+ vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0); 789+ 790+ rp += STEP_RGBA_HALF; 791+ pp += STEP_RGBA_HALF; 792+ np += STEP_RGBA_HALF; 793+ count -= STEP_RGBA_HALF; 794+ } 795+ 796+ if (count == 0) { 797+ return; 798+ } 799+ 800+ uint32x2_t vtmp2; 801+ uint8x8_t *vrpt2, *vppt2, *vnpt2; 802+ uint8x8_t vrp2, vpp2, vnp2; 803+ uint32x2_t *temp_pointer; 804+ uint32x2_t vdest_val2; 805+ 806+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 807+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 808+ vrp2 = *vrpt2; 809+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 810+ vppt2 = png_ptr(uint8x8_t, &vtmp2); 811+ vpp2 = *vppt2; 812+ vtmp2 = vld1_u32(png_ptrc(uint32_t, np)); 813+ vnpt2 = png_ptr(uint8x8_t, &vtmp2); 814+ vnp2 = *vnpt2; 815+ 816+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2); 817+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 818+ 819+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 820+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 821+ 822+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 823+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2); 824+ 825+ vdest_val2 = png_ldr(uint32x2_t, &vdestN); 826+ vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0); 827+} 828+ 829+static uint8x8_t paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c) 830+{ 831+ uint8x8_t d, e; 832+ uint16x8_t p1, pa, pb, pc; 833+ 834+ p1 = vaddl_u8(a, b); /* a + b */ 835+ pc = vaddl_u8(c, c); /* c * 2 */ 836+ pa = vabdl_u8(b, c); /* pa */ 837+ pb = vabdl_u8(a, c); /* pb */ 838+ pc = vabdq_u16(p1, pc); /* pc */ 839+ 840+ p1 = vcleq_u16(pa, pb); /* pa <= pb */ 841+ pa = vcleq_u16(pa, pc); /* pa <= pc */ 842+ pb = vcleq_u16(pb, pc); /* pb <= pc */ 843+ 844+ p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */ 845+ 846+ d = vmovn_u16(pb); 847+ e = vmovn_u16(p1); 848+ 849+ d = vbsl_u8(d, b, c); 850+ e = vbsl_u8(e, a, d); 851+ 852+ return e; 853+} 854+ 855+void png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, 856+ png_const_bytep prev_row) 857+{ 858+ png_bytep rp = row; 859+ png_const_bytep pp = prev_row; 860+ png_bytep rp_stop = row + row_info->rowbytes; 861+ 862+ uint8x16_t vtmp; 863+ uint8x8x2_t *vrpt; 864+ uint8x8x2_t vrp; 865+ uint8x8_t vlast = vdup_n_u8(0); 866+ uint8x8x4_t vdest; 867+ vdest.val[IND3] = vdup_n_u8(0); 868+ 869+ vtmp = vld1q_u8(rp); 870+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 871+ vrp = *vrpt; 872+ 873+ uint8x8x2_t *vppt; 874+ uint8x8x2_t vpp; 875+ uint8x8_t vtmp1, vtmp2, vtmp3; 876+ uint32x2_t *temp_pointer; 877+ 878+ png_debug(1, "in png_read_filter_row_paeth3_neon"); 879+ 880+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 881+ png_byte last_byte = *rp_stop; 882+ png_bytep rp_stop_new = rp_stop - tail_bytes; 883+ for (; rp < rp_stop_new; pp += STEP_RGB) 884+ { 885+ vtmp = vld1q_u8(pp); 886+ vppt = png_ptr(uint8x8x2_t, &vtmp); 887+ vpp = *vppt; 888+ 889+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 890+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 891+ 892+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 893+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 894+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 895+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 896+ 897+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 898+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 899+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 900+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 901+ 902+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 903+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 904+ 905+ vtmp = vld1q_u8(rp + STEP_RGB); 906+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 907+ vrp = *vrpt; 908+ 909+ vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3); 910+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 911+ 912+ vlast = vtmp2; 913+ 914+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 915+ rp += OFFSET3; 916+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 917+ rp += OFFSET3; 918+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 919+ rp += OFFSET3; 920+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 921+ rp += OFFSET3; 922+ } 923+ 924+ vtmp = vld1q_u8(pp); 925+ vppt = png_ptr(uint8x8x2_t, &vtmp); 926+ vpp = *vppt; 927+ 928+ if (tail_bytes == TAIL_RGB1) { 929+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 930+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 931+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 932+ } else if (tail_bytes == TAIL_RGB2) { 933+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 934+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 935+ 936+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 937+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 938+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 939+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 940+ 941+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 942+ rp += OFFSET3; 943+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 944+ } else if (tail_bytes == TAIL_RGB3) { 945+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 946+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 947+ 948+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 949+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 950+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 951+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 952+ 953+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 954+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 955+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 956+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 957+ 958+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 959+ rp += OFFSET3; 960+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 961+ rp += OFFSET3; 962+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 963+ } 964+ *rp_stop = last_byte; 965+} 966+ 967+void png_read_filter_row_paeth3_x2_neon(png_row_infop row_info, png_bytep row, 968+ png_const_bytep prev_row) 969+{ 970+ png_bytep rp = row; 971+ png_const_bytep pp = prev_row; 972+ png_bytep rp_stop = row + row_info->rowbytes; 973+ png_bytep np = rp_stop + 1; 974+ 975+ uint8x16_t vtmp; 976+ uint8x8x2_t *vrpt; 977+ uint8x8x2_t vrp; 978+ uint8x8_t vlast = vdup_n_u8(0); 979+ uint8x8x4_t vdest; 980+ vdest.val[IND3] = vdup_n_u8(0); 981+ 982+ vtmp = vld1q_u8(rp); 983+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 984+ vrp = *vrpt; 985+ 986+ uint8x8x2_t *vppt; 987+ uint8x8x2_t vpp; 988+ uint8x8_t vtmp1, vtmp2, vtmp3; 989+ uint32x2_t *temp_pointer; 990+ 991+ uint8x8x2_t *vnpt; 992+ uint8x8x2_t vnp; 993+ uint8x8_t vlastN = vdup_n_u8(0); 994+ uint8x8x4_t vdestN; 995+ vdestN.val[IND3] = vdup_n_u8(0); 996+ 997+ vtmp = vld1q_u8(np); 998+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 999+ vnp = *vnpt; 1000+ 1001+ png_debug(1, "in png_read_filter_row_paeth3_x2_neon"); 1002+ 1003+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 1004+ png_byte last_byte = *rp_stop; 1005+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); 1006+ png_bytep rp_stop_new = rp_stop - tail_bytes; 1007+ 1008+ for (; rp < rp_stop_new; pp += STEP_RGB) 1009+ { 1010+ vtmp = vld1q_u8(pp); 1011+ vppt = png_ptr(uint8x8x2_t, &vtmp); 1012+ vpp = *vppt; 1013+ 1014+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1015+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1016+ 1017+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 1018+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 1019+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 1020+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 1021+ 1022+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 1023+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 1024+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 1025+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 1026+ 1027+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 1028+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 1029+ 1030+ vtmp = vld1q_u8(rp + STEP_RGB); 1031+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 1032+ vrp = *vrpt; 1033+ 1034+ vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3); 1035+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 1036+ 1037+ vlast = vtmp2; 1038+ 1039+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1040+ rp += OFFSET3; 1041+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 1042+ rp += OFFSET3; 1043+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 1044+ rp += OFFSET3; 1045+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 1046+ rp += OFFSET3; 1047+ 1048+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1049+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1050+ 1051+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 1052+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1053+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 1054+ 1055+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 1056+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]); 1057+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1); 1058+ 1059+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1); 1060+ 1061+ vtmp = vld1q_u8(np + STEP_RGB); 1062+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 1063+ vnp = *vnpt; 1064+ 1065+ vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]); 1066+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1); 1067+ 1068+ vlastN = vdest.val[IND3]; 1069+ 1070+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1071+ np += OFFSET3; 1072+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 1073+ np += OFFSET3; 1074+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 1075+ np += OFFSET3; 1076+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0); 1077+ np += OFFSET3; 1078+ } 1079+ 1080+ vtmp = vld1q_u8(pp); 1081+ vppt = png_ptr(uint8x8x2_t, &vtmp); 1082+ vpp = *vppt; 1083+ 1084+ if (tail_bytes == TAIL_RGB1) { 1085+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1086+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1087+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1088+ 1089+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1090+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1091+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1092+ } else if (tail_bytes == TAIL_RGB2) { 1093+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1094+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1095+ 1096+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 1097+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 1098+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 1099+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 1100+ 1101+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1102+ rp += OFFSET3; 1103+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 1104+ 1105+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1106+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1107+ 1108+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 1109+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1110+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 1111+ 1112+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1113+ np += OFFSET3; 1114+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 1115+ } else if (tail_bytes == TAIL_RGB3) { 1116+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1117+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1118+ 1119+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 1120+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 1121+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 1122+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 1123+ 1124+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 1125+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 1126+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 1127+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 1128+ 1129+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1130+ rp += OFFSET3; 1131+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 1132+ rp += OFFSET3; 1133+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 1134+ 1135+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1136+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1137+ 1138+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 1139+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1140+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 1141+ 1142+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 1143+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]); 1144+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1); 1145+ 1146+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1147+ np += OFFSET3; 1148+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 1149+ np += OFFSET3; 1150+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 1151+ } 1152+ *rp_stop = last_byte; 1153+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; 1154+} 1155+ 1156+void png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 1157+ png_const_bytep prev_row) 1158+{ 1159+ png_bytep rp = row; 1160+ int count = row_info->rowbytes; 1161+ png_const_bytep pp = prev_row; 1162+ 1163+ uint8x8_t vlast = vdup_n_u8(0); 1164+ uint8x8x4_t vdest; 1165+ vdest.val[IND3] = vdup_n_u8(0); 1166+ 1167+ png_debug(1, "in png_read_filter_row_paeth4_neon"); 1168+ 1169+ uint32x2x4_t vtmp; 1170+ uint8x8x4_t *vrpt, *vppt; 1171+ uint8x8x4_t vrp, vpp; 1172+ uint32x2x4_t vdest_val; 1173+ while (count >= STEP_RGBA) { 1174+ uint32x2x4_t *temp_pointer; 1175+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 1176+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 1177+ vrp = *vrpt; 1178+ vtmp = vld4_u32(png_ptrc(uint32_t, pp)); 1179+ vppt = png_ptr(uint8x8x4_t, &vtmp); 1180+ vpp = *vppt; 1181+ 1182+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1183+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1184+ vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); 1185+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 1186+ vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]); 1187+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 1188+ vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]); 1189+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 1190+ 1191+ vlast = vpp.val[IND3]; 1192+ 1193+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 1194+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 1195+ 1196+ rp += STEP_RGBA; 1197+ pp += STEP_RGBA; 1198+ count -= STEP_RGBA; 1199+ } 1200+ 1201+ if (count >= STEP_RGBA_HALF) { 1202+ uint32x2x2_t vtmp1; 1203+ uint8x8x2_t *vrpt1, *vppt1; 1204+ uint8x8x2_t vrp1, vpp1; 1205+ uint32x2x2_t *temp_pointer; 1206+ uint32x2x2_t vdest_val1; 1207+ 1208+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 1209+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 1210+ vrp1 = *vrpt1; 1211+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 1212+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 1213+ vpp1 = *vppt1; 1214+ 1215+ vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast); 1216+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 1217+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]); 1218+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 1219+ vlast = vpp1.val[1]; 1220+ 1221+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 1222+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 1223+ vdest.val[IND3] = vdest.val[1]; 1224+ 1225+ rp += STEP_RGBA_HALF; 1226+ pp += STEP_RGBA_HALF; 1227+ count -= STEP_RGBA_HALF; 1228+ } 1229+ 1230+ if (count == 0) { 1231+ return; 1232+ } 1233+ 1234+ uint32x2_t vtmp2; 1235+ uint8x8_t *vrpt2, *vppt2; 1236+ uint8x8_t vrp2, vpp2; 1237+ uint32x2_t *temp_pointer; 1238+ uint32x2_t vdest_val2; 1239+ 1240+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 1241+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 1242+ vrp2 = *vrpt2; 1243+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 1244+ vppt2 = png_ptr(uint8x8_t, &vtmp2); 1245+ vpp2 = *vppt2; 1246+ 1247+ vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast); 1248+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 1249+ 1250+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 1251+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 1252+} 1253+ 1254+void png_read_filter_row_paeth4_x2_neon(png_row_infop row_info, png_bytep row, 1255+ png_const_bytep prev_row) 1256+{ 1257+ png_bytep rp = row; 1258+ int count = row_info->rowbytes; 1259+ png_const_bytep pp = prev_row; 1260+ png_bytep np = row + row_info->rowbytes + 1; 1261+ 1262+ uint8x8_t vlast = vdup_n_u8(0); 1263+ uint8x8x4_t vdest; 1264+ vdest.val[IND3] = vdup_n_u8(0); 1265+ 1266+ png_debug(1, "in png_read_filter_row_paeth4_x2_neon"); 1267+ 1268+ uint32x2x4_t vtmp; 1269+ uint8x8x4_t *vrpt, *vppt; 1270+ uint8x8x4_t vrp, vpp; 1271+ uint32x2x4_t vdest_val; 1272+ 1273+ uint8x8x4_t *vnpt; 1274+ uint8x8x4_t vnp; 1275+ uint8x8_t vlastN = vdup_n_u8(0); 1276+ uint8x8x4_t vdestN; 1277+ vdestN.val[IND3] = vdup_n_u8(0); 1278+ 1279+ while (count >= STEP_RGBA) { 1280+ uint32x2x4_t *temp_pointer; 1281+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 1282+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 1283+ vrp = *vrpt; 1284+ vtmp = vld4_u32(png_ptrc(uint32_t, pp)); 1285+ vppt = png_ptr(uint8x8x4_t, &vtmp); 1286+ vpp = *vppt; 1287+ vtmp = vld4_u32(png_ptrc(uint32_t, np)); 1288+ vnpt = png_ptr(uint8x8x4_t, &vtmp); 1289+ vnp = *vnpt; 1290+ 1291+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1292+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1293+ vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); 1294+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 1295+ vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]); 1296+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 1297+ vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]); 1298+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 1299+ 1300+ vlast = vpp.val[IND3]; 1301+ 1302+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 1303+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 1304+ 1305+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1306+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1307+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1308+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]); 1309+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]); 1310+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]); 1311+ vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]); 1312+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]); 1313+ 1314+ vlastN = vdest.val[IND3]; 1315+ 1316+ vdest_val = png_ldr(uint32x2x4_t, &vdestN); 1317+ vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0); 1318+ 1319+ rp += STEP_RGBA; 1320+ pp += STEP_RGBA; 1321+ np += STEP_RGBA; 1322+ count -= STEP_RGBA; 1323+ } 1324+ 1325+ if (count >= STEP_RGBA_HALF) { 1326+ uint32x2x2_t vtmp1; 1327+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1; 1328+ uint8x8x2_t vrp1, vpp1, vnp1; 1329+ uint32x2x2_t *temp_pointer; 1330+ uint32x2x2_t vdest_val1; 1331+ 1332+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 1333+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 1334+ vrp1 = *vrpt1; 1335+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 1336+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 1337+ vpp1 = *vppt1; 1338+ vtmp1 = vld2_u32(png_ptrc(uint32_t, np)); 1339+ vnpt1 = png_ptr(uint8x8x2_t, &vtmp1); 1340+ vnp1 = *vnpt1; 1341+ 1342+ vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast); 1343+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 1344+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]); 1345+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 1346+ 1347+ vlast = vpp1.val[1]; 1348+ 1349+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 1350+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 1351+ 1352+ vdest.val[IND3] = vdest.val[1]; 1353+ 1354+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1355+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]); 1356+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1357+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]); 1358+ 1359+ vlastN = vdest.val[1]; 1360+ 1361+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN); 1362+ vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0); 1363+ 1364+ vdestN.val[IND3] = vdestN.val[1]; 1365+ 1366+ rp += STEP_RGBA_HALF; 1367+ pp += STEP_RGBA_HALF; 1368+ np += STEP_RGBA_HALF; 1369+ count -= STEP_RGBA_HALF; 1370+ } 1371+ 1372+ if (count == 0) { 1373+ return; 1374+ } 1375+ 1376+ uint32x2_t vtmp2; 1377+ uint8x8_t *vrpt2, *vppt2, *vnpt2; 1378+ uint8x8_t vrp2, vpp2, vnp2; 1379+ uint32x2_t *temp_pointer; 1380+ uint32x2_t vdest_val2; 1381+ 1382+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 1383+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 1384+ vrp2 = *vrpt2; 1385+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 1386+ vppt2 = png_ptr(uint8x8_t, &vtmp2); 1387+ vpp2 = *vppt2; 1388+ vtmp2 = vld1_u32(png_ptrc(uint32_t, np)); 1389+ vnpt2 = png_ptr(uint8x8_t, &vtmp2); 1390+ vnp2 = *vnpt2; 1391+ 1392+ vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast); 1393+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 1394+ 1395+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 1396+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 1397+ 1398+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1399+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2); 1400+ 1401+ vdest_val2 = png_ldr(uint32x2_t, &vdestN); 1402+ vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0); 1403+} 1404+#endif /* PNG_MULTY_LINE_ENABLE */ 1405 #endif /* PNG_ARM_NEON_OPT > 0 */ 1406 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */ 1407 #endif /* READ */ 1408diff --git a/pngpread.c b/pngpread.c 1409index e283627b7..43ec512df 100644 1410--- a/pngpread.c 1411+++ b/pngpread.c 1412@@ -264,9 +264,22 @@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr) 1413 png_ptr->idat_size = png_ptr->push_length; 1414 png_ptr->process_mode = PNG_READ_IDAT_MODE; 1415 png_push_have_info(png_ptr, info_ptr); 1416- png_ptr->zstream.avail_out = 1417- (uInt) PNG_ROWBYTES(png_ptr->pixel_depth, 1418- png_ptr->iwidth) + 1; 1419+#ifdef PNG_MULTY_LINE_ENABLE 1420+ // OH ISSUE: png optimize 1421+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1422+ (png_ptr->transformations & PNG_CHECK) == 0) { 1423+ int rest = png_ptr->num_rows - png_ptr->row_number; 1424+ int row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS; 1425+ png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, 1426+ png_ptr->iwidth) + 1) * row_num; 1427+ } 1428+ else 1429+#endif 1430+ { 1431+ png_ptr->zstream.avail_out = 1432+ (uInt) PNG_ROWBYTES(png_ptr->pixel_depth, 1433+ png_ptr->iwidth) + 1; 1434+ } 1435 png_ptr->zstream.next_out = png_ptr->row_buf; 1436 return; 1437 } 1438@@ -623,6 +636,92 @@ png_push_read_IDAT(png_structrp png_ptr) 1439 } 1440 } 1441 1442+#ifdef PNG_MULTY_LINE_ENABLE 1443+// OH ISSUE: png optimize 1444+static void png_push_process_row_x2(png_structrp png_ptr, 1445+ png_row_info row_info_in) 1446+{ 1447+ png_debug(1, "in png_push_process_row_x2"); 1448+ png_row_info row_info = row_info_in; 1449+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, 1450+ png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4); 1451+ 1452+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1453+ if (png_ptr->transformations != 0) 1454+ png_do_read_transformations(png_ptr, &row_info); 1455+#endif 1456+ 1457+ if (png_ptr->transformed_pixel_depth == 0) 1458+ { 1459+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1460+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1461+ png_error(png_ptr, "progressive row overflow"); 1462+ } 1463+ 1464+ png_push_have_row(png_ptr, png_ptr->row_buf + 1); 1465+ png_read_push_finish_row(png_ptr); 1466+ 1467+ png_ptr->row_buf = png_ptr->row_buf + png_ptr->rowbytes + 1; 1468+ 1469+ // do it again 1470+ if (png_ptr->transformations != 0) 1471+ { 1472+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1473+ } 1474+ else 1475+ { 1476+ png_ptr->prev_row = png_ptr->row_buf; 1477+ } 1478+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1479+ if (png_ptr->transformations != 0) 1480+ png_do_read_transformations(png_ptr, &row_info); 1481+#endif 1482+ 1483+ png_push_have_row(png_ptr, png_ptr->row_buf + 1); 1484+ png_read_push_finish_row(png_ptr); 1485+} 1486+ 1487+static void png_push_process_multi_rows(png_structrp png_ptr, int row_num) 1488+{ 1489+ png_debug(1, "in png_push_process_multi_rows"); 1490+ uInt row_bytes = png_ptr->rowbytes + 1; 1491+ 1492+ png_row_info row_info; 1493+ row_info.width = png_ptr->iwidth; 1494+ row_info.color_type = png_ptr->color_type; 1495+ row_info.bit_depth = png_ptr->bit_depth; 1496+ row_info.channels = png_ptr->channels; 1497+ row_info.pixel_depth = png_ptr->pixel_depth; 1498+ row_info.rowbytes = png_ptr->rowbytes; 1499+ 1500+ png_bytep temp_row = png_ptr->row_buf; 1501+ png_bytep temp_prev_row = png_ptr->prev_row; 1502+ 1503+ for (int i = 0; i < row_num; i++) { 1504+ // check if the x2_filter is effective: only supports channels 3 or 4 1505+ if ((png_ptr->channels == 3 || png_ptr->channels == 4) && 1506+ i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB && 1507+ png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST && 1508+ png_ptr->row_buf[0] == png_ptr->row_buf[row_bytes]) 1509+ { 1510+ png_push_process_row_x2(png_ptr, row_info); 1511+ png_ptr->row_buf = png_ptr->row_buf + row_bytes; 1512+ i++; 1513+ continue; 1514+ } 1515+ png_push_process_row(png_ptr); 1516+ png_ptr->row_buf = png_ptr->row_buf + row_bytes; 1517+ } 1518+ 1519+ if (png_ptr->transformations == 0 && png_ptr->interlaced == 0) 1520+ { 1521+ png_ptr->prev_row = temp_prev_row; 1522+ memcpy(png_ptr->prev_row, png_ptr->row_buf - row_bytes, row_bytes); 1523+ } 1524+ png_ptr->row_buf = temp_row; 1525+} 1526+#endif 1527+ 1528 void /* PRIVATE */ 1529 png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1530 size_t buffer_length) 1531@@ -639,6 +738,17 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1532 /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */ 1533 png_ptr->zstream.avail_in = (uInt)buffer_length; 1534 1535+#ifdef PNG_MULTY_LINE_ENABLE 1536+ // OH ISSUE: png optimize 1537+ int row_num = 1; 1538+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1539+ (png_ptr->transformations & PNG_CHECK) == 0) 1540+ { 1541+ int rest = png_ptr->num_rows - png_ptr->row_number; 1542+ row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS; 1543+ } 1544+#endif 1545+ 1546 /* Keep going until the decompressed data is all processed 1547 * or the stream marked as finished. 1548 */ 1549@@ -655,9 +765,20 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1550 if (!(png_ptr->zstream.avail_out > 0)) 1551 { 1552 /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */ 1553+#ifdef PNG_MULTY_LINE_ENABLE 1554+ // OH ISSUE: png optimize 1555+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1556+ (png_ptr->transformations & PNG_CHECK) == 0) 1557+ { 1558+ int rest = png_ptr->num_rows - png_ptr->row_number; 1559+ row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS; 1560+ } 1561+ png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, 1562+ png_ptr->iwidth) + 1) * row_num; 1563+#else 1564 png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, 1565 png_ptr->iwidth) + 1); 1566- 1567+#endif 1568 png_ptr->zstream.next_out = png_ptr->row_buf; 1569 } 1570 1571@@ -719,7 +840,12 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1572 1573 /* Do we have a complete row? */ 1574 if (png_ptr->zstream.avail_out == 0) 1575+#ifdef PNG_MULTY_LINE_ENABLE 1576+ // OH ISSUE: png optimize 1577+ png_push_process_multi_rows(png_ptr, row_num); 1578+#else 1579 png_push_process_row(png_ptr); 1580+#endif 1581 } 1582 1583 /* And check for the end of the stream. */ 1584@@ -738,6 +864,7 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1585 void /* PRIVATE */ 1586 png_push_process_row(png_structrp png_ptr) 1587 { 1588+ png_debug(1, "in png_push_process_row"); 1589 /* 1.5.6: row_info moved out of png_struct to a local here. */ 1590 png_row_info row_info; 1591 1592@@ -762,8 +889,17 @@ png_push_process_row(png_structrp png_ptr) 1593 * it may not be in the future, so this was changed just to copy the 1594 * interlaced row count: 1595 */ 1596- memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1597- 1598+#ifdef PNG_MULTY_LINE_ENABLE 1599+ // OH ISSUE: png optimize 1600+ if (png_ptr->transformations == 0 && png_ptr->interlaced == 0) 1601+ { 1602+ png_ptr->prev_row = png_ptr->row_buf; 1603+ } 1604+ else 1605+#endif 1606+ { 1607+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1608+ } 1609 #ifdef PNG_READ_TRANSFORMS_SUPPORTED 1610 if (png_ptr->transformations != 0) 1611 png_do_read_transformations(png_ptr, &row_info); 1612diff --git a/pngpriv.h b/pngpriv.h 1613index fb521cf00..6027d9acc 100644 1614--- a/pngpriv.h 1615+++ b/pngpriv.h 1616@@ -189,6 +189,20 @@ 1617 # define PNG_ARM_NEON_IMPLEMENTATION 0 1618 #endif /* PNG_ARM_NEON_OPT > 0 */ 1619 1620+#if defined(PNG_ARM_NEON_IMPLEMENTATION) && defined(PNG_ARM_NEON) 1621+// OH ISSUE: png optimize 1622+# if PNG_ARM_NEON_IMPLEMENTATION == 1 1623+# define PNG_MULTY_LINE_ENABLE 1624+# define PNG_WRITE_NEON_ENABLE 1625+# define PNG_INFLATE_MAX_SIZE (65536) 1626+# define PNG_INFLATE_ROWS (50) 1627+# define PNG_CHECK (PNG_EXPAND | PNG_STRIP_ALPHA | PNG_RGB_TO_GRAY | PNG_ENCODE_ALPHA | \ 1628+ PNG_PACKSWAP | PNG_GRAY_TO_RGB | PNG_COMPOSE | PNG_SCALE_16_TO_8 | PNG_16_TO_8 | \ 1629+ PNG_BACKGROUND_EXPAND | PNG_EXPAND_16 | PNG_PACK | PNG_ADD_ALPHA | PNG_EXPAND_tRNS | \ 1630+ PNG_RGB_TO_GRAY_ERR | PNG_RGB_TO_GRAY_WARN | PNG_FILLER | PNG_USER_TRANSFORM) 1631+# endif 1632+#endif 1633+ 1634 #ifndef PNG_MIPS_MSA_OPT 1635 # if defined(__mips_msa) && (__mips_isa_rev >= 5) && defined(PNG_ALIGNED_MEMORY_SUPPORTED) 1636 # define PNG_MIPS_MSA_OPT 2 1637@@ -351,8 +365,14 @@ 1638 #endif 1639 1640 #ifndef PNG_INTERNAL_FUNCTION 1641+// OH ISSUE: png optimize 1642+# ifdef PNG_MULTY_LINE_ENABLE 1643+# define PNG_HIDE __attribute__((visibility("hidden"))) 1644+# else 1645+# define PNG_HIDE 1646+# endif 1647 # define PNG_INTERNAL_FUNCTION(type, name, args, attributes)\ 1648- PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_EMPTY attributes) 1649+ PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_HIDE attributes) 1650 #endif 1651 1652 #ifndef PNG_INTERNAL_CALLBACK 1653@@ -1304,6 +1324,50 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop 1654 row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1655 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop 1656 row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1657+#ifdef PNG_MULTY_LINE_ENABLE 1658+// OH ISSUE: png optimize 1659+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_up_x2_neon, (png_row_infop 1660+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1661+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg3_x2_neon, (png_row_infop 1662+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1663+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg4_x2_neon, (png_row_infop 1664+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1665+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth3_x2_neon, (png_row_infop 1666+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1667+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth4_x2_neon, (png_row_infop 1668+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1669+#endif 1670+#ifdef PNG_WRITE_NEON_ENABLE 1671+// OH ISSUE: png optimize 1672+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_sub3_neon, (png_structrp 1673+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY); 1674+PNG_INTERNAL_FUNCTION(void, png_write_filter_sub3_neon_only, (png_structrp 1675+ png_ptr, size_t row_bytes), PNG_EMPTY); 1676+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_sub4_neon, (png_structrp 1677+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY); 1678+PNG_INTERNAL_FUNCTION(void, png_write_filter_sub4_neon_only, (png_structrp 1679+ png_ptr, size_t row_bytes), PNG_EMPTY); 1680+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_up_neon, (png_structrp 1681+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY); 1682+PNG_INTERNAL_FUNCTION(void, png_write_filter_up_neon_only, (png_structrp 1683+ png_ptr, size_t row_bytes), PNG_EMPTY); 1684+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_avg3_neon, (png_structrp 1685+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY); 1686+PNG_INTERNAL_FUNCTION(void, png_write_filter_avg3_neon_only, (png_structrp 1687+ png_ptr, size_t row_bytes), PNG_EMPTY); 1688+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_avg4_neon, (png_structrp 1689+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY); 1690+PNG_INTERNAL_FUNCTION(void, png_write_filter_avg4_neon_only, (png_structrp 1691+ png_ptr, size_t row_bytes), PNG_EMPTY); 1692+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_paeth3_neon, (png_structrp 1693+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY); 1694+PNG_INTERNAL_FUNCTION(void, png_write_filter_paeth3_neon_only, (png_structrp 1695+ png_ptr, size_t row_bytes), PNG_EMPTY); 1696+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_paeth4_neon, (png_structrp 1697+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY); 1698+PNG_INTERNAL_FUNCTION(void, png_write_filter_paeth4_neon_only, (png_structrp 1699+ png_ptr, size_t row_bytes), PNG_EMPTY); 1700+#endif 1701 #endif 1702 1703 #if PNG_MIPS_MSA_OPT > 0 1704diff --git a/pngread.c b/pngread.c 1705index 8fa7d9f16..71be1a26c 100644 1706--- a/pngread.c 1707+++ b/pngread.c 1708@@ -54,7 +54,12 @@ png_create_read_struct_2,(png_const_charp user_png_ver, png_voidp error_ptr, 1709 * required (it will be zero in a write structure.) 1710 */ 1711 # ifdef PNG_SEQUENTIAL_READ_SUPPORTED 1712+#ifdef PNG_MULTY_LINE_ENABLE 1713+ // OH ISSUE: png optimize 1714+ png_ptr->IDAT_read_size = PNG_INFLATE_MAX_SIZE; 1715+#else 1716 png_ptr->IDAT_read_size = PNG_IDAT_READ_SIZE; 1717+#endif 1718 # endif 1719 1720 # ifdef PNG_BENIGN_READ_ERRORS_SUPPORTED 1721@@ -684,6 +689,224 @@ png_read_rows(png_structrp png_ptr, png_bytepp row, 1722 #endif /* SEQUENTIAL_READ */ 1723 1724 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED 1725+ 1726+#ifdef PNG_MULTY_LINE_ENABLE 1727+// OH ISSUE: png optimize 1728+static void png_read_two_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 i, 1729+ png_row_info row_info) 1730+{ 1731+ png_debug1(1, "in png_read_two_rows %d", png_ptr->row_buf[0]); 1732+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, 1733+ png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4); 1734+ 1735+#ifdef PNG_MNG_FEATURES_SUPPORTED 1736+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 && 1737+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING)) 1738+ { 1739+ /* Intrapixel differencing */ 1740+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1); 1741+ } 1742+#endif 1743+ 1744+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1745+ if (png_ptr->transformations 1746+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED 1747+ || png_ptr->num_palette_max >= 0 1748+# endif 1749+ ) 1750+ png_do_read_transformations(png_ptr, &row_info); 1751+#endif 1752+ 1753+ /* The transformed pixel depth should match the depth now in row_info. */ 1754+ if (png_ptr->transformed_pixel_depth == 0) 1755+ { 1756+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1757+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1758+ png_error(png_ptr, "sequential row overflow"); 1759+ } 1760+ 1761+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth) 1762+ png_error(png_ptr, "internal sequential row size calculation error"); 1763+ 1764+ if (rows[i] != NULL) 1765+ png_combine_row(png_ptr, rows[i], -1); 1766+ 1767+ png_read_finish_row(png_ptr); 1768+ 1769+ if (png_ptr->read_row_fn != NULL) 1770+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass); 1771+ 1772+ png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; 1773+ 1774+ // do again next line 1775+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1776+ 1777+#ifdef PNG_MNG_FEATURES_SUPPORTED 1778+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 && 1779+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING)) 1780+ { 1781+ /* Intrapixel differencing */ 1782+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1); 1783+ } 1784+#endif 1785+ 1786+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1787+ if (png_ptr->transformations 1788+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED 1789+ || png_ptr->num_palette_max >= 0 1790+# endif 1791+ ) 1792+ png_do_read_transformations(png_ptr, &row_info); 1793+#endif 1794+ 1795+ /* The transformed pixel depth should match the depth now in row_info. */ 1796+ if (png_ptr->transformed_pixel_depth == 0) 1797+ { 1798+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1799+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1800+ png_error(png_ptr, "sequential row overflow"); 1801+ } 1802+ 1803+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth) 1804+ png_error(png_ptr, "internal sequential row size calculation error"); 1805+ 1806+ if (rows[i+1] != NULL) 1807+ png_combine_row(png_ptr, rows[i+1], -1); 1808+ 1809+ png_read_finish_row(png_ptr); 1810+ 1811+ if (png_ptr->read_row_fn != NULL) 1812+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass); 1813+ 1814+ png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; 1815+} 1816+ 1817+static void png_read_muilty_rows(png_structrp png_ptr, png_bytepp rows, 1818+ png_uint_32 row_num, png_row_info row_info_in) 1819+{ 1820+ if (png_ptr == NULL) 1821+ return; 1822+ 1823+ png_debug2(1, "in png_read_muilty_rows (row %lu, pass %d)", 1824+ (unsigned long)png_ptr->row_number, png_ptr->pass); 1825+ 1826+ if ((png_ptr->mode & PNG_HAVE_IDAT) == 0) 1827+ png_error(png_ptr, "Invalid attempt to read row data"); 1828+ 1829+ /* Fill the row with IDAT data: */ 1830+ uInt row_bytes = row_info_in.rowbytes; 1831+ png_ptr->row_buf[0]=255; /* 255 to force error if no data was found */ 1832+ png_read_IDAT_data(png_ptr, png_ptr->row_buf, (row_bytes + 1) * row_num); 1833+ png_bytep temp_row = png_ptr->row_buf; 1834+ 1835+ for (png_uint_32 i = 0; i < row_num; i++) { 1836+ png_row_info row_info = row_info_in; 1837+ // check if the x2_filter is effective: only supports channels 3 or 4 1838+ if ((row_info_in.channels == 3 || row_info_in.channels == 4) && 1839+ i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB && 1840+ png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST && 1841+ png_ptr->row_buf[0] == png_ptr->row_buf[row_info_in.rowbytes + 1]) 1842+ { 1843+ png_read_two_rows(png_ptr, rows, i, row_info); 1844+ i++; 1845+ continue; 1846+ } 1847+ if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE) 1848+ { 1849+ if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST) 1850+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, 1851+ png_ptr->prev_row + 1, png_ptr->row_buf[0]); 1852+ else 1853+ png_debug1(1, "bad adaptive filter value %d", png_ptr->row_buf[0]); 1854+ } 1855+ 1856+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_in.rowbytes + 1); 1857+ 1858+#ifdef PNG_MNG_FEATURES_SUPPORTED 1859+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 && 1860+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING)) 1861+ { 1862+ /* Intrapixel differencing */ 1863+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1); 1864+ } 1865+#endif 1866+ 1867+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1868+ if (png_ptr->transformations 1869+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED 1870+ || png_ptr->num_palette_max >= 0 1871+# endif 1872+ ) 1873+ png_do_read_transformations(png_ptr, &row_info); 1874+#endif 1875+ 1876+ /* The transformed pixel depth should match the depth now in row_info. */ 1877+ if (png_ptr->transformed_pixel_depth == 0) 1878+ { 1879+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1880+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1881+ png_error(png_ptr, "sequential row overflow"); 1882+ } 1883+ 1884+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth) 1885+ png_error(png_ptr, "internal sequential row size calculation error"); 1886+ 1887+ if (rows[i] != NULL) 1888+ png_combine_row(png_ptr, rows[i], -1); 1889+ 1890+ png_read_finish_row(png_ptr); 1891+ 1892+ if (png_ptr->read_row_fn != NULL) 1893+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass); 1894+ 1895+ png_ptr->row_buf = png_ptr->row_buf + row_bytes + 1; 1896+ } 1897+ png_ptr->row_buf = temp_row; 1898+} 1899+ 1900+static void png_warn_check(png_structrp png_ptr) 1901+{ 1902+#ifdef PNG_WARNINGS_SUPPORTED 1903+ /* Check for transforms that have been set but were defined out */ 1904+#if defined(PNG_WRITE_INVERT_SUPPORTED) && !defined(PNG_READ_INVERT_SUPPORTED) 1905+ if ((png_ptr->transformations & PNG_INVERT_MONO) != 0) 1906+ png_warning(png_ptr, "PNG_READ_INVERT_SUPPORTED is not defined"); 1907+#endif 1908+ 1909+#if defined(PNG_WRITE_FILLER_SUPPORTED) && !defined(PNG_READ_FILLER_SUPPORTED) 1910+ if ((png_ptr->transformations & PNG_FILLER) != 0) 1911+ png_warning(png_ptr, "PNG_READ_FILLER_SUPPORTED is not defined"); 1912+#endif 1913+ 1914+#if defined(PNG_WRITE_PACKSWAP_SUPPORTED) && \ 1915+ !defined(PNG_READ_PACKSWAP_SUPPORTED) 1916+ if ((png_ptr->transformations & PNG_PACKSWAP) != 0) 1917+ png_warning(png_ptr, "PNG_READ_PACKSWAP_SUPPORTED is not defined"); 1918+#endif 1919+ 1920+#if defined(PNG_WRITE_PACK_SUPPORTED) && !defined(PNG_READ_PACK_SUPPORTED) 1921+ if ((png_ptr->transformations & PNG_PACK) != 0) 1922+ png_warning(png_ptr, "PNG_READ_PACK_SUPPORTED is not defined"); 1923+#endif 1924+ 1925+#if defined(PNG_WRITE_SHIFT_SUPPORTED) && !defined(PNG_READ_SHIFT_SUPPORTED) 1926+ if ((png_ptr->transformations & PNG_SHIFT) != 0) 1927+ png_warning(png_ptr, "PNG_READ_SHIFT_SUPPORTED is not defined"); 1928+#endif 1929+ 1930+#if defined(PNG_WRITE_BGR_SUPPORTED) && !defined(PNG_READ_BGR_SUPPORTED) 1931+ if ((png_ptr->transformations & PNG_BGR) != 0) 1932+ png_warning(png_ptr, "PNG_READ_BGR_SUPPORTED is not defined"); 1933+#endif 1934+ 1935+#if defined(PNG_WRITE_SWAP_SUPPORTED) && !defined(PNG_READ_SWAP_SUPPORTED) 1936+ if ((png_ptr->transformations & PNG_SWAP_BYTES) != 0) 1937+ png_warning(png_ptr, "PNG_READ_SWAP_SUPPORTED is not defined"); 1938+#endif 1939+#endif /* WARNINGS */ 1940+} 1941+#endif // PNG_MULTY_LINE_ENABLE 1942+ 1943 /* Read the entire image. If the image has an alpha channel or a tRNS 1944 * chunk, and you have called png_handle_alpha()[*], you will need to 1945 * initialize the image to the current image that PNG will be overlaying. 1946@@ -745,13 +968,45 @@ png_read_image(png_structrp png_ptr, png_bytepp image) 1947 1948 image_height=png_ptr->height; 1949 1950- for (j = 0; j < pass; j++) 1951- { 1952+#ifdef PNG_MULTY_LINE_ENABLE 1953+ // OH ISSUE: png optimize 1954+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1955+ (png_ptr->transformations & PNG_CHECK) == 0) { 1956+ if ((png_ptr->flags & PNG_FLAG_ROW_INIT) == 0) 1957+ png_read_start_row(png_ptr); 1958+ 1959+ png_warn_check(png_ptr); 1960+ png_row_info row_info; 1961+ row_info.width = png_ptr->iwidth; 1962+ row_info.color_type = png_ptr->color_type; 1963+ row_info.bit_depth = png_ptr->bit_depth; 1964+ row_info.channels = png_ptr->channels; 1965+ row_info.pixel_depth = png_ptr->pixel_depth; 1966+ row_info.rowbytes = png_ptr->rowbytes; 1967+ 1968 rp = image; 1969- for (i = 0; i < image_height; i++) 1970+ int row_num = PNG_INFLATE_ROWS; 1971+ for (i = 0; i < image_height; i += PNG_INFLATE_ROWS) 1972 { 1973- png_read_row(png_ptr, *rp, NULL); 1974- rp++; 1975+ if (image_height - i < PNG_INFLATE_ROWS) 1976+ { 1977+ row_num = image_height - i; 1978+ } 1979+ png_read_muilty_rows(png_ptr, rp, row_num, row_info); 1980+ rp += row_num; 1981+ } 1982+ } 1983+ else 1984+#endif 1985+ { 1986+ for (j = 0; j < pass; j++) 1987+ { 1988+ rp = image; 1989+ for (i = 0; i < image_height; i++) 1990+ { 1991+ png_read_row(png_ptr, *rp, NULL); 1992+ rp++; 1993+ } 1994 } 1995 } 1996 } 1997diff --git a/pngrutil.c b/pngrutil.c 1998index 9ac8ec11f..8afdf4fa5 100644 1999--- a/pngrutil.c 2000+++ b/pngrutil.c 2001@@ -4134,7 +4134,12 @@ png_read_filter_row(png_structrp pp, png_row_infop row_info, png_bytep row, 2002 * PNG_FILTER_OPTIMIZATIONS to a function that overrides the generic 2003 * implementations. See png_init_filter_functions above. 2004 */ 2005+#ifdef PNG_MULTY_LINE_ENABLE 2006+ // OH ISSUE: png optimize 2007+ if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST_X2) 2008+#else 2009 if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST) 2010+#endif 2011 { 2012 if (pp->read_filter[0] == NULL) 2013 png_init_filter_functions(pp); 2014@@ -4606,7 +4611,24 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED) 2015 row_bytes + 48); 2016 2017 else 2018+ { 2019+#ifdef PNG_MULTY_LINE_ENABLE 2020+ // OH ISSUE: png optimize 2021+ png_uint_32 row_num = 1; 2022+ if (png_ptr->bit_depth == 8 && 2023+ (png_ptr->transformations & PNG_CHECK) == 0) 2024+ { 2025+ row_num = png_ptr->height < PNG_INFLATE_ROWS ? 2026+ png_ptr->height : PNG_INFLATE_ROWS; 2027+ } 2028+ png_ptr->big_row_buf = (png_bytep)png_malloc( 2029+ png_ptr, row_bytes * row_num + 48); 2030+ if (png_ptr->big_row_buf == NULL) 2031+ png_error(png_ptr, "png_malloc failed"); 2032+#else 2033 png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes + 48); 2034+#endif 2035+ } 2036 2037 png_ptr->big_prev_row = (png_bytep)png_malloc(png_ptr, row_bytes + 48); 2038 2039diff --git a/pngstruct.h b/pngstruct.h 2040index e591d94d5..7c3846475 100644 2041--- a/pngstruct.h 2042+++ b/pngstruct.h 2043@@ -140,6 +140,14 @@ typedef const png_colorspace * PNG_RESTRICT png_const_colorspacerp; 2044 #define PNG_COLORSPACE_CANCEL(flags) (0xffff ^ (flags)) 2045 #endif /* COLORSPACE || GAMMA */ 2046 2047+#ifdef PNG_MULTY_LINE_ENABLE 2048+// OH ISSUE: png optimize 2049+#define PNG_FILTER_VALUE_UP_X2 (6) // PNG_FILTER_VALUE_UP + 4 2050+#define PNG_FILTER_VALUE_AVG_X2 (7) // PNG_FILTER_VALUE_AVG + 4 2051+#define PNG_FILTER_VALUE_PAETH_X2 (8) // PNG_FILTER_VALUE_PAETH + 4 2052+#define PNG_FILTER_VALUE_LAST_X2 (9) // PNG_FILTER_VALUE_LAST + 4 2053+#endif 2054+ 2055 struct png_struct_def 2056 { 2057 #ifdef PNG_SETJMP_SUPPORTED 2058@@ -467,8 +475,14 @@ struct png_struct_def 2059 png_bytep big_prev_row; 2060 2061 /* New member added in libpng-1.5.7 */ 2062+#ifdef PNG_MULTY_LINE_ENABLE 2063+ // OH ISSUE: png optimize 2064+ void (*read_filter[PNG_FILTER_VALUE_LAST_X2 - 1])(png_row_infop row_info, 2065+ png_bytep row, png_const_bytep prev_row); 2066+#else 2067 void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info, 2068 png_bytep row, png_const_bytep prev_row); 2069+#endif 2070 2071 #ifdef PNG_READ_SUPPORTED 2072 #if defined(PNG_COLORSPACE_SUPPORTED) || defined(PNG_GAMMA_SUPPORTED) 2073diff --git a/pngtrans.c b/pngtrans.c 2074index 1100f46eb..99736747a 100644 2075--- a/pngtrans.c 2076+++ b/pngtrans.c 2077@@ -13,6 +13,19 @@ 2078 2079 #include "pngpriv.h" 2080 2081+#ifdef PNG_MULTY_LINE_ENABLE 2082+# if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64) 2083+# include <arm64_neon.h> 2084+# else 2085+# include <arm_neon.h> 2086+# endif 2087+# define STEP_GRAY (16) 2088+# define STEP_GA (32) 2089+# define STEP_RGB (48) 2090+# define STEP_RGBA (64) 2091+# define INDEX2 (2) 2092+#endif 2093+ 2094 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED) 2095 2096 #if defined(PNG_READ_BGR_SUPPORTED) || defined(PNG_WRITE_BGR_SUPPORTED) 2097@@ -269,13 +282,19 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2098 if (row_info->color_type == PNG_COLOR_TYPE_GRAY) 2099 { 2100 png_bytep rp = row; 2101- size_t i; 2102- size_t istop = row_info->rowbytes; 2103- 2104- for (i = 0; i < istop; i++) 2105+ png_bytep rp_stop = row + row_info->rowbytes; 2106+#ifdef PNG_MULTY_LINE_ENABLE 2107+ png_bytep rp_stop_neon = rp_stop - STEP_GRAY; 2108+ for (; rp < rp_stop_neon; rp += STEP_GRAY) 2109+ { 2110+ uint8x16_t gray = vld1q_u8(rp); 2111+ gray = ~gray; 2112+ vst1q_u8(rp, gray); 2113+ } 2114+#endif 2115+ for (; rp < rp_stop; rp++) 2116 { 2117 *rp = (png_byte)(~(*rp)); 2118- rp++; 2119 } 2120 } 2121 2122@@ -283,13 +302,19 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2123 row_info->bit_depth == 8) 2124 { 2125 png_bytep rp = row; 2126- size_t i; 2127- size_t istop = row_info->rowbytes; 2128- 2129- for (i = 0; i < istop; i += 2) 2130+ png_bytep rp_stop = row + row_info->rowbytes; 2131+#ifdef PNG_MULTY_LINE_ENABLE 2132+ png_bytep rp_stop_neon = rp_stop - STEP_GA; 2133+ for (; rp < rp_stop_neon; rp += STEP_GA) 2134+ { 2135+ uint8x16x2_t gray_alpha = vld2q_u8(rp); 2136+ gray_alpha.val[0] = ~gray_alpha.val[0]; 2137+ vst2q_u8(rp, gray_alpha); 2138+ } 2139+#endif 2140+ for (; rp < rp_stop; rp += 2) 2141 { 2142 *rp = (png_byte)(~(*rp)); 2143- rp += 2; 2144 } 2145 } 2146 2147@@ -298,14 +323,21 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2148 row_info->bit_depth == 16) 2149 { 2150 png_bytep rp = row; 2151- size_t i; 2152- size_t istop = row_info->rowbytes; 2153- 2154- for (i = 0; i < istop; i += 4) 2155+ png_bytep rp_stop = row + row_info->rowbytes; 2156+#ifdef PNG_MULTY_LINE_ENABLE 2157+ png_bytep rp_stop_neon = rp_stop - STEP_RGBA; 2158+ for (; rp < rp_stop_neon; rp += STEP_RGBA) 2159+ { 2160+ uint8x16x4_t gray_alpha = vld4q_u8(rp); 2161+ gray_alpha.val[0] = ~gray_alpha.val[0]; 2162+ gray_alpha.val[1] = ~gray_alpha.val[1]; 2163+ vst4q_u8(rp, gray_alpha); 2164+ } 2165+#endif 2166+ for (; rp < rp_stop; rp += 4) 2167 { 2168 *rp = (png_byte)(~(*rp)); 2169 *(rp + 1) = (png_byte)(~(*(rp + 1))); 2170- rp += 4; 2171 } 2172 } 2173 #endif 2174@@ -323,10 +355,19 @@ png_do_swap(png_row_infop row_info, png_bytep row) 2175 if (row_info->bit_depth == 16) 2176 { 2177 png_bytep rp = row; 2178- png_uint_32 i; 2179- png_uint_32 istop= row_info->width * row_info->channels; 2180- 2181- for (i = 0; i < istop; i++, rp += 2) 2182+ png_bytep rp_stop = row + row_info->rowbytes; 2183+#ifdef PNG_MULTY_LINE_ENABLE 2184+ png_bytep rp_stop_neon = rp_stop - STEP_GA; 2185+ for (; rp < rp_stop_neon; rp += STEP_GA) 2186+ { 2187+ uint8x16x2_t gray = vld2q_u8(rp); 2188+ uint8x16_t tmp = gray.val[0]; 2189+ gray.val[0] = gray.val[1]; 2190+ gray.val[1] = tmp; 2191+ vst2q_u8(rp, gray); 2192+ } 2193+#endif 2194+ for (; rp < rp_stop; rp += 2) 2195 { 2196 #ifdef PNG_BUILTIN_BSWAP16_SUPPORTED 2197 /* Feature added to libpng-1.6.11 for testing purposes, not 2198@@ -622,15 +663,24 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2199 2200 if ((row_info->color_type & PNG_COLOR_MASK_COLOR) != 0) 2201 { 2202- png_uint_32 row_width = row_info->width; 2203 if (row_info->bit_depth == 8) 2204 { 2205 if (row_info->color_type == PNG_COLOR_TYPE_RGB) 2206 { 2207- png_bytep rp; 2208- png_uint_32 i; 2209- 2210- for (i = 0, rp = row; i < row_width; i++, rp += 3) 2211+ png_bytep rp = row; 2212+ png_bytep rp_stop = row + row_info->rowbytes; 2213+#ifdef PNG_MULTY_LINE_ENABLE 2214+ png_bytep rp_stop_neon = rp_stop - STEP_RGB; 2215+ for (; rp < rp_stop_neon; rp += STEP_RGB) 2216+ { 2217+ uint8x16x3_t bgr = vld3q_u8(rp); 2218+ uint8x16_t tmp = bgr.val[INDEX2]; 2219+ bgr.val[INDEX2] = bgr.val[0]; 2220+ bgr.val[0] = tmp; 2221+ vst3q_u8(rp, bgr); 2222+ } 2223+#endif 2224+ for (; rp < rp_stop; rp += 3) 2225 { 2226 png_byte save = *rp; 2227 *rp = *(rp + 2); 2228@@ -640,10 +690,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2229 2230 else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA) 2231 { 2232- png_bytep rp; 2233- png_uint_32 i; 2234- 2235- for (i = 0, rp = row; i < row_width; i++, rp += 4) 2236+ png_bytep rp = row; 2237+ png_bytep rp_stop = row + row_info->rowbytes; 2238+#ifdef PNG_MULTY_LINE_ENABLE 2239+ png_bytep rp_stop_neon = rp_stop - STEP_RGBA; 2240+ for (; rp < rp_stop_neon; rp += STEP_RGBA) 2241+ { 2242+ uint8x16x4_t bgra = vld4q_u8(rp); 2243+ uint8x16_t tmp = bgra.val[INDEX2]; 2244+ bgra.val[INDEX2] = bgra.val[0]; 2245+ bgra.val[0] = tmp; 2246+ vst4q_u8(rp, bgra); 2247+ } 2248+#endif 2249+ for (; rp < rp_stop; rp += 4) 2250 { 2251 png_byte save = *rp; 2252 *rp = *(rp + 2); 2253@@ -657,10 +717,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2254 { 2255 if (row_info->color_type == PNG_COLOR_TYPE_RGB) 2256 { 2257- png_bytep rp; 2258- png_uint_32 i; 2259- 2260- for (i = 0, rp = row; i < row_width; i++, rp += 6) 2261+ png_bytep rp = row; 2262+ png_bytep rp_stop = row + row_info->rowbytes; 2263+#ifdef PNG_MULTY_LINE_ENABLE 2264+ png_bytep rp_stop_neon = rp_stop - STEP_RGB; 2265+ for (; rp < rp_stop_neon; rp += STEP_RGB) 2266+ { 2267+ uint16x8x3_t bgr = vld3q_u16((unsigned short *)rp); 2268+ uint16x8_t tmp = bgr.val[INDEX2]; 2269+ bgr.val[INDEX2] = bgr.val[0]; 2270+ bgr.val[0] = tmp; 2271+ vst3q_u16((unsigned short *)rp, bgr); 2272+ } 2273+#endif 2274+ for (; rp < rp_stop; rp += 6) 2275 { 2276 png_byte save = *rp; 2277 *rp = *(rp + 4); 2278@@ -673,10 +743,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2279 2280 else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA) 2281 { 2282- png_bytep rp; 2283- png_uint_32 i; 2284- 2285- for (i = 0, rp = row; i < row_width; i++, rp += 8) 2286+ png_bytep rp = row; 2287+ png_bytep rp_stop = row + row_info->rowbytes; 2288+#ifdef PNG_MULTY_LINE_ENABLE 2289+ png_bytep rp_stop_neon = rp_stop - STEP_RGBA; 2290+ for (; rp < rp_stop_neon; rp += STEP_RGBA) 2291+ { 2292+ uint16x8x4_t bgra = vld4q_u16((unsigned short *)rp); 2293+ uint16x8_t tmp = bgra.val[INDEX2]; 2294+ bgra.val[INDEX2] = bgra.val[0]; 2295+ bgra.val[0] = tmp; 2296+ vst4q_u16((unsigned short *)rp, bgra); 2297+ } 2298+#endif 2299+ for (; rp < rp_stop; rp += 8) 2300 { 2301 png_byte save = *rp; 2302 *rp = *(rp + 4); 2303diff --git a/pngwutil.c b/pngwutil.c 2304index 16345e4c0..212c090b6 100644 2305--- a/pngwutil.c 2306+++ b/pngwutil.c 2307@@ -16,6 +16,20 @@ 2308 #ifdef PNG_WRITE_SUPPORTED 2309 2310 #ifdef PNG_WRITE_INT_FUNCTIONS_SUPPORTED 2311+#ifdef PNG_WRITE_NEON_ENABLE 2312+// OH ISSUE: png optimize 2313+# if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64) 2314+# include <arm64_neon.h> 2315+# else 2316+# include <arm_neon.h> 2317+# endif 2318+# define STEP 16 2319+# define MID 128 2320+# define SHIFT_RGB 13 2321+# define SHIFT_RGBA 12 2322+# define BYTE_RGB 3 2323+# define BYTE_RGBA 4 2324+#endif 2325 /* Place a 32-bit number into a buffer in PNG byte order. We work 2326 * with unsigned numbers for convenience, although one supported 2327 * ancillary chunk uses signed (two's complement) numbers. 2328@@ -2275,10 +2289,939 @@ png_write_filtered_row(png_structrp png_ptr, png_bytep filtered_row, 2329 size_t row_bytes); 2330 2331 #ifdef PNG_WRITE_FILTER_SUPPORTED 2332+#ifdef PNG_WRITE_NEON_ENABLE 2333+size_t png_write_filter_sub3_neon(png_structrp png_ptr, 2334+ size_t row_bytes, size_t lmins) 2335+{ 2336+ size_t sum = 0; 2337+ png_bytep rp = png_ptr->row_buf + 1; 2338+ png_bytep dp = png_ptr->try_row + 1; 2339+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB; 2340+ 2341+ size_t count = row_bytes; 2342+ uint8x16_t tmp = vdupq_n_u8(0); 2343+ while (count >= STEP) 2344+ { 2345+ uint8x16_t qrp = vld1q_u8(rp); 2346+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); 2347+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2348+ vst1q_u8(dp, qdp); 2349+ tmp = qrp; 2350+ int8x16_t v_s = vreinterpretq_s8_u8(qdp); 2351+ v_s = vabsq_s8(v_s); 2352+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s); 2353+ sum += vaddlvq_u8(v_u); 2354+ rp += STEP; 2355+ dp += STEP; 2356+ count -= STEP; 2357+ if (sum > lmins) 2358+ { 2359+ return sum; 2360+ } 2361+ } 2362+ 2363+ if (count == row_bytes) 2364+ { 2365+ dp[0] = rp[0]; 2366+ dp[1] = rp[1]; 2367+ dp[2] = rp[2]; 2368+ sum += MID - abs((int)dp[0] - MID); 2369+ sum += MID - abs((int)dp[1] - MID); 2370+ sum += MID - abs((int)dp[2] - MID); 2371+ rp += BYTE_RGB; 2372+ dp += BYTE_RGB; 2373+ count -= BYTE_RGB; 2374+ } 2375+ 2376+ png_bytep lp = rp - BYTE_RGB; 2377+ while (count > 0) 2378+ { 2379+ *dp = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff); 2380+ sum += MID - abs((int)*dp++ - MID); 2381+ count--; 2382+ if (sum > lmins) 2383+ { 2384+ return sum; 2385+ } 2386+ } 2387+ return sum; 2388+} 2389+ 2390+void png_write_filter_sub3_neon_only(png_structrp png_ptr, 2391+ size_t row_bytes) 2392+{ 2393+ png_bytep rp = png_ptr->row_buf + 1; 2394+ png_bytep dp = png_ptr->try_row + 1; 2395+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB; 2396+ 2397+ size_t count = row_bytes; 2398+ uint8x16_t tmp = vdupq_n_u8(0); 2399+ while (count >= STEP) 2400+ { 2401+ uint8x16_t qrp = vld1q_u8(rp); 2402+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); 2403+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2404+ vst1q_u8(dp, qdp); 2405+ tmp = qrp; 2406+ rp += STEP; 2407+ dp += STEP; 2408+ count -= STEP; 2409+ } 2410+ 2411+ if (count == row_bytes) 2412+ { 2413+ dp[0] = rp[0]; 2414+ dp[1] = rp[1]; 2415+ dp[2] = rp[2]; 2416+ rp += BYTE_RGB; 2417+ dp += BYTE_RGB; 2418+ count -= BYTE_RGB; 2419+ } 2420+ 2421+ png_bytep lp = rp - BYTE_RGB; 2422+ while (count > 0) 2423+ { 2424+ *dp++ = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff); 2425+ count--; 2426+ } 2427+} 2428+ 2429+size_t png_write_filter_sub4_neon(png_structrp png_ptr, 2430+ size_t row_bytes, size_t lmins) 2431+{ 2432+ size_t sum = 0; 2433+ png_bytep rp = png_ptr->row_buf + 1; 2434+ png_bytep dp = png_ptr->try_row + 1; 2435+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB; 2436+ 2437+ size_t count = row_bytes; 2438+ uint8x16_t tmp = vdupq_n_u8(0); 2439+ while (count >= STEP) 2440+ { 2441+ uint8x16_t qrp = vld1q_u8(rp); 2442+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); 2443+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2444+ vst1q_u8(dp, qdp); 2445+ tmp = qrp; 2446+ int8x16_t v_s = vreinterpretq_s8_u8(qdp); 2447+ v_s = vabsq_s8(v_s); 2448+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s); 2449+ sum += vaddlvq_u8(v_u); 2450+ rp += STEP; 2451+ dp += STEP; 2452+ count -= STEP; 2453+ if (sum > lmins) 2454+ { 2455+ return sum; 2456+ } 2457+ } 2458+ 2459+ if (count == row_bytes) 2460+ { 2461+ dp[0] = rp[0]; 2462+ dp[1] = rp[1]; 2463+ dp[2] = rp[2]; 2464+ dp[3] = rp[3]; 2465+ sum += MID - abs((int)dp[0] - MID); 2466+ sum += MID - abs((int)dp[1] - MID); 2467+ sum += MID - abs((int)dp[2] - MID); 2468+ sum += MID - abs((int)dp[3] - MID); 2469+ rp += BYTE_RGBA; 2470+ dp += BYTE_RGBA; 2471+ count -= BYTE_RGBA; 2472+ } 2473+ 2474+ png_bytep lp = rp - BYTE_RGBA; 2475+ while (count > 0) 2476+ { 2477+ *dp = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff); 2478+ sum += MID - abs((int)*dp++ - MID); 2479+ count--; 2480+ if (sum > lmins) 2481+ { 2482+ return sum; 2483+ } 2484+ } 2485+ return sum; 2486+} 2487+ 2488+void png_write_filter_sub4_neon_only(png_structrp png_ptr, 2489+ size_t row_bytes) 2490+{ 2491+ png_bytep rp = png_ptr->row_buf + 1; 2492+ png_bytep dp = png_ptr->try_row + 1; 2493+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB; 2494+ 2495+ size_t count = row_bytes; 2496+ uint8x16_t tmp = vdupq_n_u8(0); 2497+ while (count >= STEP) 2498+ { 2499+ uint8x16_t qrp = vld1q_u8(rp); 2500+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); 2501+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2502+ vst1q_u8(dp, qdp); 2503+ tmp = qrp; 2504+ rp += STEP; 2505+ dp += STEP; 2506+ count -= STEP; 2507+ } 2508+ 2509+ if (count == row_bytes) 2510+ { 2511+ dp[0] = rp[0]; 2512+ dp[1] = rp[1]; 2513+ dp[2] = rp[2]; 2514+ dp[3] = rp[3]; 2515+ rp += BYTE_RGBA; 2516+ dp += BYTE_RGBA; 2517+ count -= BYTE_RGBA; 2518+ } 2519+ 2520+ png_bytep lp = rp - BYTE_RGBA; 2521+ while (count > 0) 2522+ { 2523+ *dp++ = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff); 2524+ count--; 2525+ } 2526+} 2527+ 2528+size_t png_write_filter_up_neon(png_structrp png_ptr, 2529+ size_t row_bytes, size_t lmins) 2530+{ 2531+ size_t sum = 0; 2532+ png_bytep rp = png_ptr->row_buf + 1; 2533+ png_bytep dp = png_ptr->try_row + 1; 2534+ png_bytep pp = png_ptr->prev_row + 1; 2535+ png_ptr->try_row[0] = PNG_FILTER_VALUE_UP; 2536+ 2537+ size_t count = row_bytes; 2538+ while (count >= STEP) 2539+ { 2540+ uint8x16_t qrp = vld1q_u8(rp); 2541+ uint8x16_t qpp = vld1q_u8(pp); 2542+ uint8x16_t qdp = vsubq_u8(qrp, qpp); 2543+ vst1q_u8(dp, qdp); 2544+ int8x16_t v_s = vreinterpretq_s8_u8(qdp); 2545+ v_s = vabsq_s8(v_s); 2546+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s); 2547+ sum += vaddlvq_u8(v_u); 2548+ rp += STEP; 2549+ pp += STEP; 2550+ dp += STEP; 2551+ count -= STEP; 2552+ if (sum > lmins) 2553+ { 2554+ return sum; 2555+ } 2556+ } 2557+ 2558+ while (count > 0) 2559+ { 2560+ *dp = (png_byte)(((int)*rp++ - (int)*pp++) & 0xff); 2561+ sum += MID - abs((int)*dp++ - MID); 2562+ count--; 2563+ if (sum > lmins) 2564+ { 2565+ return sum; 2566+ } 2567+ } 2568+ return sum; 2569+} 2570+ 2571+void png_write_filter_up_neon_only(png_structrp png_ptr, size_t row_bytes) 2572+{ 2573+ png_bytep rp = png_ptr->row_buf + 1; 2574+ png_bytep dp = png_ptr->try_row + 1; 2575+ png_bytep pp = png_ptr->prev_row + 1; 2576+ png_ptr->try_row[0] = PNG_FILTER_VALUE_UP; 2577+ 2578+ size_t count = row_bytes; 2579+ while (count >= STEP) 2580+ { 2581+ uint8x16_t qrp = vld1q_u8(rp); 2582+ uint8x16_t qpp = vld1q_u8(pp); 2583+ uint8x16_t qdp = vsubq_u8(qrp, qpp); 2584+ vst1q_u8(dp, qdp); 2585+ rp += STEP; 2586+ pp += STEP; 2587+ dp += STEP; 2588+ count -= STEP; 2589+ } 2590+ 2591+ while (count > 0) 2592+ { 2593+ *dp++ = (png_byte)(((int)*rp++ - (int)*pp++) & 0xff); 2594+ count--; 2595+ } 2596+} 2597+ 2598+size_t png_write_filter_avg3_neon(png_structrp png_ptr, 2599+ size_t row_bytes, size_t lmins) 2600+{ 2601+ size_t sum = 0; 2602+ png_bytep rp = png_ptr->row_buf + 1; 2603+ png_bytep pp = png_ptr->prev_row + 1; 2604+ png_bytep dp = png_ptr->try_row + 1; 2605+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG; 2606+ 2607+ size_t count = row_bytes; 2608+ uint8x16_t tmp = vdupq_n_u8(0); 2609+ while (count >= STEP) 2610+ { 2611+ uint8x16_t qrp = vld1q_u8(rp); 2612+ uint8x16_t qpp = vld1q_u8(pp); 2613+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); 2614+ qlp = vhaddq_u8(qpp, qlp); 2615+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2616+ vst1q_u8(dp, qdp); 2617+ int8x16_t v_s = vreinterpretq_s8_u8(qdp); 2618+ v_s = vabsq_s8(v_s); 2619+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s); 2620+ sum += vaddlvq_u8(v_u); 2621+ tmp = qrp; 2622+ rp += STEP; 2623+ pp += STEP; 2624+ dp += STEP; 2625+ count -= STEP; 2626+ if (sum > lmins) 2627+ { 2628+ return sum; 2629+ } 2630+ } 2631+ 2632+ if (count == row_bytes) 2633+ { 2634+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff); 2635+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff); 2636+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff); 2637+ sum += MID - abs((int)dp[0] - MID); 2638+ sum += MID - abs((int)dp[1] - MID); 2639+ sum += MID - abs((int)dp[2] - MID); 2640+ rp += BYTE_RGB; 2641+ pp += BYTE_RGB; 2642+ dp += BYTE_RGB; 2643+ count -= BYTE_RGB; 2644+ } 2645+ 2646+ png_bytep lp = rp - BYTE_RGB; 2647+ while (count > 0) 2648+ { 2649+ *dp = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff); 2650+ count--; 2651+ sum += MID - abs((int)*dp++ - MID); 2652+ if (sum > lmins) 2653+ { 2654+ return sum; 2655+ } 2656+ } 2657+ return sum; 2658+} 2659+ 2660+void png_write_filter_avg3_neon_only(png_structrp png_ptr, 2661+ size_t row_bytes) 2662+{ 2663+ png_bytep rp = png_ptr->row_buf + 1; 2664+ png_bytep pp = png_ptr->prev_row + 1; 2665+ png_bytep dp = png_ptr->try_row + 1; 2666+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG; 2667+ 2668+ size_t count = row_bytes; 2669+ uint8x16_t tmp = vdupq_n_u8(0); 2670+ while (count >= STEP) 2671+ { 2672+ uint8x16_t qrp = vld1q_u8(rp); 2673+ uint8x16_t qpp = vld1q_u8(pp); 2674+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); 2675+ qlp = vhaddq_u8(qpp, qlp); 2676+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2677+ vst1q_u8(dp, qdp); 2678+ tmp = qrp; 2679+ rp += STEP; 2680+ pp += STEP; 2681+ dp += STEP; 2682+ count -= STEP; 2683+ } 2684+ 2685+ if (count == row_bytes) 2686+ { 2687+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff); 2688+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff); 2689+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff); 2690+ rp += BYTE_RGB; 2691+ pp += BYTE_RGB; 2692+ dp += BYTE_RGB; 2693+ count -= BYTE_RGB; 2694+ } 2695+ 2696+ png_bytep lp = rp - BYTE_RGB; 2697+ while (count > 0) 2698+ { 2699+ *dp++ = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff); 2700+ count--; 2701+ } 2702+} 2703+ 2704+size_t png_write_filter_avg4_neon(png_structrp png_ptr, 2705+ size_t row_bytes, size_t lmins) 2706+{ 2707+ size_t sum = 0; 2708+ png_bytep rp = png_ptr->row_buf + 1; 2709+ png_bytep pp = png_ptr->prev_row + 1; 2710+ png_bytep dp = png_ptr->try_row + 1; 2711+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG; 2712+ 2713+ size_t count = row_bytes; 2714+ uint8x16_t tmp = vdupq_n_u8(0); 2715+ while (count >= STEP) 2716+ { 2717+ uint8x16_t qrp = vld1q_u8(rp); 2718+ uint8x16_t qpp = vld1q_u8(pp); 2719+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); 2720+ qlp = vhaddq_u8(qpp, qlp); 2721+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2722+ vst1q_u8(dp, qdp); 2723+ int8x16_t v_s = vreinterpretq_s8_u8(qdp); 2724+ v_s = vabsq_s8(v_s); 2725+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s); 2726+ sum += vaddlvq_u8(v_u); 2727+ tmp = qrp; 2728+ rp += STEP; 2729+ pp += STEP; 2730+ dp += STEP; 2731+ count -= STEP; 2732+ if (sum > lmins) 2733+ { 2734+ return sum; 2735+ } 2736+ } 2737+ 2738+ if (count == row_bytes) 2739+ { 2740+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff); 2741+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff); 2742+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff); 2743+ dp[3] = (png_byte)(((int)rp[3] - ((int)pp[3] / 2)) & 0xff); 2744+ sum += MID - abs((int)dp[0] - MID); 2745+ sum += MID - abs((int)dp[1] - MID); 2746+ sum += MID - abs((int)dp[2] - MID); 2747+ sum += MID - abs((int)dp[3] - MID); 2748+ rp += BYTE_RGBA; 2749+ pp += BYTE_RGBA; 2750+ dp += BYTE_RGBA; 2751+ count -= BYTE_RGBA; 2752+ } 2753+ 2754+ png_bytep lp = rp - BYTE_RGBA; 2755+ while (count > 0) 2756+ { 2757+ *dp = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff); 2758+ count--; 2759+ sum += MID - abs((int)*dp++ - MID); 2760+ if (sum > lmins) 2761+ { 2762+ return sum; 2763+ } 2764+ } 2765+ return sum; 2766+} 2767+ 2768+void png_write_filter_avg4_neon_only(png_structrp png_ptr, 2769+ size_t row_bytes) 2770+{ 2771+ png_bytep rp = png_ptr->row_buf + 1; 2772+ png_bytep pp = png_ptr->prev_row + 1; 2773+ png_bytep dp = png_ptr->try_row + 1; 2774+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG; 2775+ 2776+ size_t count = row_bytes; 2777+ uint8x16_t tmp = vdupq_n_u8(0); 2778+ while (count >= STEP) 2779+ { 2780+ uint8x16_t qrp = vld1q_u8(rp); 2781+ uint8x16_t qpp = vld1q_u8(pp); 2782+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); 2783+ qlp = vhaddq_u8(qpp, qlp); 2784+ uint8x16_t qdp = vsubq_u8(qrp, qlp); 2785+ vst1q_u8(dp, qdp); 2786+ tmp = qrp; 2787+ rp += STEP; 2788+ pp += STEP; 2789+ dp += STEP; 2790+ count -= STEP; 2791+ } 2792+ 2793+ if (count == row_bytes) 2794+ { 2795+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff); 2796+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff); 2797+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff); 2798+ dp[3] = (png_byte)(((int)rp[3] - ((int)pp[3] / 2)) & 0xff); 2799+ rp += BYTE_RGBA; 2800+ pp += BYTE_RGBA; 2801+ dp += BYTE_RGBA; 2802+ count -= BYTE_RGBA; 2803+ } 2804+ 2805+ png_bytep lp = rp - BYTE_RGBA; 2806+ while (count > 0) 2807+ { 2808+ *dp++ = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff); 2809+ count--; 2810+ } 2811+} 2812+ 2813+size_t png_write_filter_paeth3_neon(png_structrp png_ptr, 2814+ size_t row_bytes, size_t lmins) 2815+{ 2816+ size_t sum = 0; 2817+ png_bytep rp = png_ptr->row_buf + 1; 2818+ png_bytep pp = png_ptr->prev_row + 1; 2819+ png_bytep dp = png_ptr->try_row + 1; 2820+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH; 2821+ 2822+ size_t count = row_bytes; 2823+ uint8x16_t tmp_a = vdupq_n_u8(0); 2824+ uint8x16_t tmp_c = vdupq_n_u8(0); 2825+ while (count >= STEP) 2826+ { 2827+ uint8x16_t qrp = vld1q_u8(rp); 2828+ uint8x16_t b = vld1q_u8(pp); 2829+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGB); 2830+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGB); 2831+ tmp_a = qrp; 2832+ tmp_c = b; 2833+ 2834+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); 2835+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))); 2836+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))); 2837+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))); 2838+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c))); 2839+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c))); 2840+ 2841+ int16x8_t p = vsubq_s16(b_hign, c_hign); 2842+ int16x8_t pc = vsubq_s16(a_hign, c_hign); 2843+ int16x8_t pa = vabsq_s16(p); 2844+ int16x8_t pb = vabsq_s16(pc); 2845+ pc = vabsq_s16(vaddq_s16(p, pc)); 2846+ uint16x8_t p1_u = vcleq_s16(pa, pb); 2847+ uint16x8_t pa_u = vcleq_s16(pa, pc); 2848+ uint16x8_t pb_u = vcleq_s16(pb, pc); 2849+ p1_u = vandq_u16(p1_u, pa_u); 2850+ uint8x8_t d_hign = vmovn_u16(pb_u); 2851+ uint8x8_t e_hign = vmovn_u16(p1_u); 2852+ 2853+ p = vsubq_s16(b_low, c_low); 2854+ pc = vsubq_s16(a_low, c_low); 2855+ pa = vabsq_s16(p); 2856+ pb = vabsq_s16(pc); 2857+ pc = vabsq_s16(vaddq_s16(p, pc)); 2858+ p1_u = vcleq_s16(pa, pb); 2859+ pa_u = vcleq_s16(pa, pc); 2860+ pb_u = vcleq_s16(pb, pc); 2861+ p1_u = vandq_u16(p1_u, pa_u); 2862+ uint8x8_t d_low = vmovn_u16(pb_u); 2863+ uint8x8_t e_low = vmovn_u16(p1_u); 2864+ 2865+ uint8x16_t d = vcombine_u8(d_low, d_hign); 2866+ uint8x16_t e = vcombine_u8(e_low, e_hign); 2867+ d = vbslq_u8(d, b, c); 2868+ e = vbslq_u8(e, a, d); 2869+ 2870+ uint8x16_t qdp = vsubq_u8(qrp, e); 2871+ vst1q_u8(dp, qdp); 2872+ int8x16_t v_s = vreinterpretq_s8_u8(qdp); 2873+ v_s = vabsq_s8(v_s); 2874+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s); 2875+ sum += vaddlvq_u8(v_u); 2876+ 2877+ rp += STEP; 2878+ pp += STEP; 2879+ dp += STEP; 2880+ count -= STEP; 2881+ if (sum > lmins) 2882+ { 2883+ return sum; 2884+ } 2885+ } 2886+ 2887+ if (count == row_bytes) 2888+ { 2889+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff); 2890+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff); 2891+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff); 2892+ sum += MID - abs((int)dp[0] - MID); 2893+ sum += MID - abs((int)dp[1] - MID); 2894+ sum += MID - abs((int)dp[2] - MID); 2895+ rp += BYTE_RGB; 2896+ pp += BYTE_RGB; 2897+ dp += BYTE_RGB; 2898+ count -= BYTE_RGB; 2899+ } 2900+ 2901+ png_bytep cp = pp - BYTE_RGB; 2902+ png_bytep lp = rp - BYTE_RGB; 2903+ while (count > 0) 2904+ { 2905+ int a, b, c, pa, pb, pc, p; 2906+ 2907+ b = *pp++; 2908+ c = *cp++; 2909+ a = *lp++; 2910+ 2911+ p = b - c; 2912+ pc = a - c; 2913+ 2914+ pa = abs(p); 2915+ pb = abs(pc); 2916+ pc = abs(p + pc); 2917+ 2918+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c; 2919+ *dp = (png_byte)(((int)*rp++ - p) & 0xff); 2920+ 2921+ count--; 2922+ sum += MID - abs((int)*dp++ - MID); 2923+ if (sum > lmins) 2924+ { 2925+ return sum; 2926+ } 2927+ } 2928+ return sum; 2929+} 2930+ 2931+void png_write_filter_paeth3_neon_only(png_structrp png_ptr, 2932+ size_t row_bytes) 2933+{ 2934+ png_bytep rp = png_ptr->row_buf + 1; 2935+ png_bytep pp = png_ptr->prev_row + 1; 2936+ png_bytep dp = png_ptr->try_row + 1; 2937+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH; 2938+ 2939+ size_t count = row_bytes; 2940+ uint8x16_t tmp_a = vdupq_n_u8(0); 2941+ uint8x16_t tmp_c = vdupq_n_u8(0); 2942+ while (count >= STEP) 2943+ { 2944+ uint8x16_t qrp = vld1q_u8(rp); 2945+ uint8x16_t b = vld1q_u8(pp); 2946+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGB); 2947+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGB); 2948+ tmp_a = qrp; 2949+ tmp_c = b; 2950+ 2951+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); 2952+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))); 2953+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))); 2954+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))); 2955+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c))); 2956+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c))); 2957+ 2958+ int16x8_t p = vsubq_s16(b_hign, c_hign); 2959+ int16x8_t pc = vsubq_s16(a_hign, c_hign); 2960+ int16x8_t pa = vabsq_s16(p); 2961+ int16x8_t pb = vabsq_s16(pc); 2962+ pc = vabsq_s16(vaddq_s16(p, pc)); 2963+ uint16x8_t p1_u = vcleq_s16(pa, pb); 2964+ uint16x8_t pa_u = vcleq_s16(pa, pc); 2965+ uint16x8_t pb_u = vcleq_s16(pb, pc); 2966+ p1_u = vandq_u16(p1_u, pa_u); 2967+ uint8x8_t d_hign = vmovn_u16(pb_u); 2968+ uint8x8_t e_hign = vmovn_u16(p1_u); 2969+ 2970+ p = vsubq_s16(b_low, c_low); 2971+ pc = vsubq_s16(a_low, c_low); 2972+ pa = vabsq_s16(p); 2973+ pb = vabsq_s16(pc); 2974+ pc = vabsq_s16(vaddq_s16(p, pc)); 2975+ p1_u = vcleq_s16(pa, pb); 2976+ pa_u = vcleq_s16(pa, pc); 2977+ pb_u = vcleq_s16(pb, pc); 2978+ p1_u = vandq_u16(p1_u, pa_u); 2979+ uint8x8_t d_low = vmovn_u16(pb_u); 2980+ uint8x8_t e_low = vmovn_u16(p1_u); 2981+ 2982+ uint8x16_t d = vcombine_u8(d_low, d_hign); 2983+ uint8x16_t e = vcombine_u8(e_low, e_hign); 2984+ d = vbslq_u8(d, b, c); 2985+ e = vbslq_u8(e, a, d); 2986+ 2987+ uint8x16_t qdp = vsubq_u8(qrp, e); 2988+ vst1q_u8(dp, qdp); 2989+ 2990+ rp += STEP; 2991+ pp += STEP; 2992+ dp += STEP; 2993+ count -= STEP; 2994+ } 2995+ 2996+ if (count == row_bytes) 2997+ { 2998+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff); 2999+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff); 3000+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff); 3001+ rp += BYTE_RGB; 3002+ pp += BYTE_RGB; 3003+ dp += BYTE_RGB; 3004+ count -= BYTE_RGB; 3005+ } 3006+ 3007+ png_bytep cp = pp - BYTE_RGB; 3008+ png_bytep lp = rp - BYTE_RGB; 3009+ while (count > 0) 3010+ { 3011+ int a, b, c, pa, pb, pc, p; 3012+ 3013+ b = *pp++; 3014+ c = *cp++; 3015+ a = *lp++; 3016+ 3017+ p = b - c; 3018+ pc = a - c; 3019+ 3020+ pa = abs(p); 3021+ pb = abs(pc); 3022+ pc = abs(p + pc); 3023+ 3024+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c; 3025+ *dp++ = (png_byte)(((int)*rp++ - p) & 0xff); 3026+ count--; 3027+ } 3028+} 3029+ 3030+size_t png_write_filter_paeth4_neon(png_structrp png_ptr, 3031+ size_t row_bytes, size_t lmins) 3032+{ 3033+ size_t sum = 0; 3034+ png_bytep rp = png_ptr->row_buf + 1; 3035+ png_bytep pp = png_ptr->prev_row + 1; 3036+ png_bytep dp = png_ptr->try_row + 1; 3037+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH; 3038+ 3039+ size_t count = row_bytes; 3040+ uint8x16_t tmp_a = vdupq_n_u8(0); 3041+ uint8x16_t tmp_c = vdupq_n_u8(0); 3042+ while (count >= STEP) 3043+ { 3044+ uint8x16_t qrp = vld1q_u8(rp); 3045+ uint8x16_t b = vld1q_u8(pp); 3046+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGBA); 3047+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGBA); 3048+ tmp_a = qrp; 3049+ tmp_c = b; 3050+ 3051+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); 3052+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))); 3053+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))); 3054+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))); 3055+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c))); 3056+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c))); 3057+ 3058+ int16x8_t p = vsubq_s16(b_hign, c_hign); 3059+ int16x8_t pc = vsubq_s16(a_hign, c_hign); 3060+ int16x8_t pa = vabsq_s16(p); 3061+ int16x8_t pb = vabsq_s16(pc); 3062+ pc = vabsq_s16(vaddq_s16(p, pc)); 3063+ uint16x8_t p1_u = vcleq_s16(pa, pb); 3064+ uint16x8_t pa_u = vcleq_s16(pa, pc); 3065+ uint16x8_t pb_u = vcleq_s16(pb, pc); 3066+ p1_u = vandq_u16(p1_u, pa_u); 3067+ uint8x8_t d_hign = vmovn_u16(pb_u); 3068+ uint8x8_t e_hign = vmovn_u16(p1_u); 3069+ 3070+ p = vsubq_s16(b_low, c_low); 3071+ pc = vsubq_s16(a_low, c_low); 3072+ pa = vabsq_s16(p); 3073+ pb = vabsq_s16(pc); 3074+ pc = vabsq_s16(vaddq_s16(p, pc)); 3075+ p1_u = vcleq_s16(pa, pb); 3076+ pa_u = vcleq_s16(pa, pc); 3077+ pb_u = vcleq_s16(pb, pc); 3078+ p1_u = vandq_u16(p1_u, pa_u); 3079+ uint8x8_t d_low = vmovn_u16(pb_u); 3080+ uint8x8_t e_low = vmovn_u16(p1_u); 3081+ 3082+ uint8x16_t d = vcombine_u8(d_low, d_hign); 3083+ uint8x16_t e = vcombine_u8(e_low, e_hign); 3084+ d = vbslq_u8(d, b, c); 3085+ e = vbslq_u8(e, a, d); 3086+ 3087+ uint8x16_t qdp = vsubq_u8(qrp, e); 3088+ vst1q_u8(dp, qdp); 3089+ int8x16_t v_s = vreinterpretq_s8_u8(qdp); 3090+ v_s = vabsq_s8(v_s); 3091+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s); 3092+ sum += vaddlvq_u8(v_u); 3093+ 3094+ rp += STEP; 3095+ pp += STEP; 3096+ dp += STEP; 3097+ count -= STEP; 3098+ if (sum > lmins) 3099+ { 3100+ return sum; 3101+ } 3102+ } 3103+ 3104+ if (count == row_bytes) 3105+ { 3106+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff); 3107+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff); 3108+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff); 3109+ dp[3] = (png_byte)(((int)rp[3] - (int)pp[3]) & 0xff); 3110+ sum += MID - abs((int)dp[0] - MID); 3111+ sum += MID - abs((int)dp[1] - MID); 3112+ sum += MID - abs((int)dp[2] - MID); 3113+ sum += MID - abs((int)dp[3] - MID); 3114+ rp += BYTE_RGBA; 3115+ pp += BYTE_RGBA; 3116+ dp += BYTE_RGBA; 3117+ count -= BYTE_RGBA; 3118+ } 3119+ 3120+ png_bytep cp = pp - BYTE_RGBA; 3121+ png_bytep lp = rp - BYTE_RGBA; 3122+ while (count > 0) 3123+ { 3124+ int a, b, c, pa, pb, pc, p; 3125+ 3126+ b = *pp++; 3127+ c = *cp++; 3128+ a = *lp++; 3129+ 3130+ p = b - c; 3131+ pc = a - c; 3132+ 3133+ pa = abs(p); 3134+ pb = abs(pc); 3135+ pc = abs(p + pc); 3136+ 3137+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c; 3138+ *dp = (png_byte)(((int)*rp++ - p) & 0xff); 3139+ 3140+ count--; 3141+ sum += MID - abs((int)*dp++ - MID); 3142+ if (sum > lmins) 3143+ { 3144+ return sum; 3145+ } 3146+ } 3147+ return sum; 3148+} 3149+ 3150+void png_write_filter_paeth4_neon_only(png_structrp png_ptr, 3151+ size_t row_bytes) 3152+{ 3153+ png_bytep rp = png_ptr->row_buf + 1; 3154+ png_bytep pp = png_ptr->prev_row + 1; 3155+ png_bytep dp = png_ptr->try_row + 1; 3156+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH; 3157+ 3158+ size_t count = row_bytes; 3159+ uint8x16_t tmp_a = vdupq_n_u8(0); 3160+ uint8x16_t tmp_c = vdupq_n_u8(0); 3161+ while (count >= STEP) 3162+ { 3163+ uint8x16_t qrp = vld1q_u8(rp); 3164+ uint8x16_t b = vld1q_u8(pp); 3165+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGBA); 3166+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGBA); 3167+ tmp_a = qrp; 3168+ tmp_c = b; 3169+ 3170+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); 3171+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))); 3172+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))); 3173+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))); 3174+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c))); 3175+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c))); 3176+ 3177+ int16x8_t p = vsubq_s16(b_hign, c_hign); 3178+ int16x8_t pc = vsubq_s16(a_hign, c_hign); 3179+ int16x8_t pa = vabsq_s16(p); 3180+ int16x8_t pb = vabsq_s16(pc); 3181+ pc = vabsq_s16(vaddq_s16(p, pc)); 3182+ uint16x8_t p1_u = vcleq_s16(pa, pb); 3183+ uint16x8_t pa_u = vcleq_s16(pa, pc); 3184+ uint16x8_t pb_u = vcleq_s16(pb, pc); 3185+ p1_u = vandq_u16(p1_u, pa_u); 3186+ uint8x8_t d_hign = vmovn_u16(pb_u); 3187+ uint8x8_t e_hign = vmovn_u16(p1_u); 3188+ 3189+ p = vsubq_s16(b_low, c_low); 3190+ pc = vsubq_s16(a_low, c_low); 3191+ pa = vabsq_s16(p); 3192+ pb = vabsq_s16(pc); 3193+ pc = vabsq_s16(vaddq_s16(p, pc)); 3194+ p1_u = vcleq_s16(pa, pb); 3195+ pa_u = vcleq_s16(pa, pc); 3196+ pb_u = vcleq_s16(pb, pc); 3197+ p1_u = vandq_u16(p1_u, pa_u); 3198+ uint8x8_t d_low = vmovn_u16(pb_u); 3199+ uint8x8_t e_low = vmovn_u16(p1_u); 3200+ 3201+ uint8x16_t d = vcombine_u8(d_low, d_hign); 3202+ uint8x16_t e = vcombine_u8(e_low, e_hign); 3203+ d = vbslq_u8(d, b, c); 3204+ e = vbslq_u8(e, a, d); 3205+ 3206+ uint8x16_t qdp = vsubq_u8(qrp, e); 3207+ vst1q_u8(dp, qdp); 3208+ 3209+ rp += STEP; 3210+ pp += STEP; 3211+ dp += STEP; 3212+ count -= STEP; 3213+ } 3214+ 3215+ if (count == row_bytes) 3216+ { 3217+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff); 3218+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff); 3219+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff); 3220+ dp[3] = (png_byte)(((int)rp[3] - (int)pp[3]) & 0xff); 3221+ rp += BYTE_RGBA; 3222+ pp += BYTE_RGBA; 3223+ dp += BYTE_RGBA; 3224+ count -= BYTE_RGBA; 3225+ } 3226+ 3227+ png_bytep cp = pp - BYTE_RGBA; 3228+ png_bytep lp = rp - BYTE_RGBA; 3229+ while (count > 0) 3230+ { 3231+ int a, b, c, pa, pb, pc, p; 3232+ 3233+ b = *pp++; 3234+ c = *cp++; 3235+ a = *lp++; 3236+ 3237+ p = b - c; 3238+ pc = a - c; 3239+ 3240+ pa = abs(p); 3241+ pb = abs(pc); 3242+ pc = abs(p + pc); 3243+ 3244+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c; 3245+ *dp++ = (png_byte)(((int)*rp++ - p) & 0xff); 3246+ count--; 3247+ } 3248+} 3249+#endif 3250+ 3251 static size_t /* PRIVATE */ 3252 png_setup_sub_row(png_structrp png_ptr, png_uint_32 bpp, 3253 size_t row_bytes, size_t lmins) 3254 { 3255+#ifdef PNG_WRITE_NEON_ENABLE 3256+ if (bpp == 3) 3257+ { 3258+ return png_write_filter_sub3_neon(png_ptr, row_bytes, lmins); 3259+ } 3260+ if (bpp == 4) 3261+ { 3262+ return png_write_filter_sub4_neon(png_ptr, row_bytes, lmins); 3263+ } 3264+#endif 3265 png_bytep rp, dp, lp; 3266 size_t i; 3267 size_t sum = 0; 3268@@ -2318,6 +3261,16 @@ static void /* PRIVATE */ 3269 png_setup_sub_row_only(png_structrp png_ptr, png_uint_32 bpp, 3270 size_t row_bytes) 3271 { 3272+#ifdef PNG_WRITE_NEON_ENABLE 3273+ if (bpp == 3) 3274+ { 3275+ return png_write_filter_sub3_neon_only(png_ptr, row_bytes); 3276+ } 3277+ if (bpp == 4) 3278+ { 3279+ return png_write_filter_sub4_neon_only(png_ptr, row_bytes); 3280+ } 3281+#endif 3282 png_bytep rp, dp, lp; 3283 size_t i; 3284 3285@@ -2339,6 +3292,9 @@ png_setup_sub_row_only(png_structrp png_ptr, png_uint_32 bpp, 3286 static size_t /* PRIVATE */ 3287 png_setup_up_row(png_structrp png_ptr, size_t row_bytes, size_t lmins) 3288 { 3289+#ifdef PNG_WRITE_NEON_ENABLE 3290+ return png_write_filter_up_neon(png_ptr, row_bytes, lmins); 3291+#endif 3292 png_bytep rp, dp, pp; 3293 size_t i; 3294 size_t sum = 0; 3295@@ -2366,6 +3322,9 @@ png_setup_up_row(png_structrp png_ptr, size_t row_bytes, size_t lmins) 3296 static void /* PRIVATE */ 3297 png_setup_up_row_only(png_structrp png_ptr, size_t row_bytes) 3298 { 3299+#ifdef PNG_WRITE_NEON_ENABLE 3300+ return png_write_filter_up_neon_only(png_ptr, row_bytes); 3301+#endif 3302 png_bytep rp, dp, pp; 3303 size_t i; 3304 3305@@ -2383,6 +3342,16 @@ static size_t /* PRIVATE */ 3306 png_setup_avg_row(png_structrp png_ptr, png_uint_32 bpp, 3307 size_t row_bytes, size_t lmins) 3308 { 3309+#ifdef PNG_WRITE_NEON_ENABLE 3310+ if (bpp == 3) 3311+ { 3312+ return png_write_filter_avg3_neon(png_ptr, row_bytes, lmins); 3313+ } 3314+ if (bpp == 4) 3315+ { 3316+ return png_write_filter_avg4_neon(png_ptr, row_bytes, lmins); 3317+ } 3318+#endif 3319 png_bytep rp, dp, pp, lp; 3320 png_uint_32 i; 3321 size_t sum = 0; 3322@@ -2423,6 +3392,16 @@ static void /* PRIVATE */ 3323 png_setup_avg_row_only(png_structrp png_ptr, png_uint_32 bpp, 3324 size_t row_bytes) 3325 { 3326+#ifdef PNG_WRITE_NEON_ENABLE 3327+ if (bpp == 3) 3328+ { 3329+ return png_write_filter_avg3_neon_only(png_ptr, row_bytes); 3330+ } 3331+ if (bpp == 4) 3332+ { 3333+ return png_write_filter_avg4_neon_only(png_ptr, row_bytes); 3334+ } 3335+#endif 3336 png_bytep rp, dp, pp, lp; 3337 png_uint_32 i; 3338 3339@@ -2445,6 +3424,16 @@ static size_t /* PRIVATE */ 3340 png_setup_paeth_row(png_structrp png_ptr, png_uint_32 bpp, 3341 size_t row_bytes, size_t lmins) 3342 { 3343+#ifdef PNG_WRITE_NEON_ENABLE 3344+ if (bpp == 3) 3345+ { 3346+ return png_write_filter_paeth3_neon(png_ptr, row_bytes, lmins); 3347+ } 3348+ if (bpp == 4) 3349+ { 3350+ return png_write_filter_paeth4_neon(png_ptr, row_bytes, lmins); 3351+ } 3352+#endif 3353 png_bytep rp, dp, pp, cp, lp; 3354 size_t i; 3355 size_t sum = 0; 3356@@ -2506,6 +3495,16 @@ static void /* PRIVATE */ 3357 png_setup_paeth_row_only(png_structrp png_ptr, png_uint_32 bpp, 3358 size_t row_bytes) 3359 { 3360+#ifdef PNG_WRITE_NEON_ENABLE 3361+ if (bpp == 3) 3362+ { 3363+ return png_write_filter_paeth3_neon_only(png_ptr, row_bytes); 3364+ } 3365+ if (bpp == 4) 3366+ { 3367+ return png_write_filter_paeth4_neon_only(png_ptr, row_bytes); 3368+ } 3369+#endif 3370 png_bytep rp, dp, pp, cp, lp; 3371 size_t i; 3372 3373@@ -2613,6 +3612,25 @@ png_write_find_filter(png_structrp png_ptr, png_row_infop row_info) 3374 */ 3375 png_bytep rp; 3376 size_t sum = 0; 3377+#ifdef PNG_WRITE_NEON_ENABLE 3378+ size_t bytes = row_info->rowbytes; 3379+ rp = row_buf + 1; 3380+ while (bytes >= STEP) 3381+ { 3382+ uint8x16_t v = vld1q_u8(rp); 3383+ int8x16_t v_s = vreinterpretq_s8_u8(v); 3384+ v_s = vabsq_s8(v_s); 3385+ v = vreinterpretq_u8_s8(v_s); 3386+ sum += vaddlvq_u8(v); 3387+ rp += STEP; 3388+ bytes -= STEP; 3389+ } 3390+ while (bytes > 0) 3391+ { 3392+ sum += 128 - abs((int)*rp++ - 128); 3393+ bytes--; 3394+ } 3395+#else 3396 size_t i; 3397 unsigned int v; 3398 3399@@ -2627,7 +3645,7 @@ png_write_find_filter(png_structrp png_ptr, png_row_infop row_info) 3400 #endif 3401 } 3402 } 3403- 3404+#endif 3405 mins = sum; 3406 } 3407 3408