Lines Matching full:8
21 // Read 8 Y, 4 U and 4 V from 422
24 "ld1 {v0.8b}, [%0], #8 \n" \
30 // Read 8 Y, 8 U and 8 V from 444
33 "ld1 {v0.8b}, [%0], #8 \n" \
35 "ld1 {v1.d}[0], [%1], #8 \n" \
37 "ld1 {v1.d}[1], [%2], #8 \n" \
38 "uaddlp v1.8h, v1.16b \n" \
39 "rshrn v1.8b, v1.8h, #1 \n"
41 // Read 8 Y, and set 4 U and 4 V to 128
44 "ld1 {v0.8b}, [%0], #8 \n" \
45 "movi v1.8b , #128 \n"
47 // Read 8 Y and 4 UV from NV12
50 "ld1 {v0.8b}, [%0], #8 \n" \
52 "ld1 {v2.8b}, [%1], #8 \n" \
53 "uzp1 v1.8b, v2.8b, v2.8b \n" \
54 "uzp2 v3.8b, v2.8b, v2.8b \n" \
57 // Read 8 Y and 4 VU from NV21
60 "ld1 {v0.8b}, [%0], #8 \n" \
62 "ld1 {v2.8b}, [%1], #8 \n" \
63 "uzp1 v3.8b, v2.8b, v2.8b \n" \
64 "uzp2 v1.8b, v2.8b, v2.8b \n" \
67 // Read 8 YUY2
70 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
71 "uzp2 v3.8b, v1.8b, v1.8b \n" \
72 "uzp1 v1.8b, v1.8b, v1.8b \n" \
75 // Read 8 UYVY
78 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
79 "orr v0.8b, v3.8b, v3.8b \n" \
80 "uzp1 v1.8b, v2.8b, v2.8b \n" \
81 "uzp2 v3.8b, v2.8b, v2.8b \n" \
85 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
86 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
87 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
89 "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
90 "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
93 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
94 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
95 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
100 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
101 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
103 "uxtl v2.8h, v2.8b \n" \
104 "uxtl v1.8h, v1.8b \n" /* Extract U */ \
105 "mul v3.8h, v1.8h, v27.8h \n" \
106 "mul v5.8h, v1.8h, v29.8h \n" \
107 "mul v6.8h, v2.8h, v30.8h \n" \
108 "mul v7.8h, v2.8h, v28.8h \n" \
109 "sqadd v6.8h, v6.8h, v5.8h \n" \
111 ".8h, v24.8h, v0.8h \n" /* B */ \
113 ".8h, v25.8h, v0.8h \n" /* G */ \
115 ".8h, v26.8h, v0.8h \n" /* R */ \
116 "sqadd " #vB ".8h, " #vB \
117 ".8h, v3.8h \n" /* B */ \
118 "sqsub " #vG ".8h, " #vG \
119 ".8h, v6.8h \n" /* G */ \
120 "sqadd " #vR ".8h, " #vR \
121 ".8h, v7.8h \n" /* R */ \
122 "sqshrun " #vB ".8b, " #vB \
123 ".8h, #6 \n" /* B */ \
124 "sqshrun " #vG ".8b, " #vG \
125 ".8h, #6 \n" /* G */ \
126 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
136 "movi v23.8b, #255 \n" /* A */ in I444ToARGBRow_NEON()
140 "subs %w4, %w4, #8 \n" in I444ToARGBRow_NEON()
142 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" in I444ToARGBRow_NEON()
166 "movi v23.8b, #255 \n" /* A */ in I422ToARGBRow_NEON()
170 "subs %w4, %w4, #8 \n" in I422ToARGBRow_NEON()
172 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" in I422ToARGBRow_NEON()
201 "ld1 {v23.8b}, [%3], #8 \n" in I422AlphaToARGBRow_NEON()
202 "subs %w5, %w5, #8 \n" in I422AlphaToARGBRow_NEON()
204 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" in I422AlphaToARGBRow_NEON()
229 "movi v20.8b, #255 \n" /* A */ in I422ToRGBARow_NEON()
233 "subs %w4, %w4, #8 \n" in I422ToRGBARow_NEON()
235 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" in I422ToRGBARow_NEON()
262 "subs %w4, %w4, #8 \n" in I422ToRGB24Row_NEON()
264 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" in I422ToRGB24Row_NEON()
281 "shll v0.8h, v22.8b, #8 \n" /* R */ \
282 "shll v21.8h, v21.8b, #8 \n" /* G */ \
283 "shll v20.8h, v20.8b, #8 \n" /* B */ \
284 "sri v0.8h, v21.8h, #5 \n" /* RG */ \
285 "sri v0.8h, v20.8h, #11 \n" /* RGB */
298 "subs %w4, %w4, #8 \n" in I422ToRGB565Row_NEON()
301 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. in I422ToRGB565Row_NEON()
318 "shll v0.8h, v23.8b, #8 \n" /* A */ \
319 "shll v22.8h, v22.8b, #8 \n" /* R */ \
320 "shll v21.8h, v21.8b, #8 \n" /* G */ \
321 "shll v20.8h, v20.8b, #8 \n" /* B */ \
322 "sri v0.8h, v22.8h, #1 \n" /* AR */ \
323 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
324 "sri v0.8h, v20.8h, #11 \n" /* ARGB */
334 "movi v23.8b, #255 \n" in I422ToARGB1555Row_NEON()
338 "subs %w4, %w4, #8 \n" in I422ToARGB1555Row_NEON()
341 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555. in I422ToARGB1555Row_NEON()
358 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
359 "ushr v20.8b, v20.8b, #4 \n" /* B */ \
360 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
361 "ushr v22.8b, v22.8b, #4 \n" /* R */ \
362 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
363 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
364 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
379 "subs %w4, %w4, #8 \n" in I422ToARGB4444Row_NEON()
380 "movi v23.8b, #255 \n" in I422ToARGB4444Row_NEON()
383 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. in I422ToARGB4444Row_NEON()
402 "movi v23.8b, #255 \n" in I400ToARGBRow_NEON()
406 "subs %w2, %w2, #8 \n" in I400ToARGBRow_NEON()
408 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" in I400ToARGBRow_NEON()
424 "movi v23.8b, #255 \n" in J400ToARGBRow_NEON()
427 "ld1 {v20.8b}, [%0], #8 \n" in J400ToARGBRow_NEON()
428 "orr v21.8b, v20.8b, v20.8b \n" in J400ToARGBRow_NEON()
429 "orr v22.8b, v20.8b, v20.8b \n" in J400ToARGBRow_NEON()
430 "subs %w2, %w2, #8 \n" in J400ToARGBRow_NEON()
432 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" in J400ToARGBRow_NEON()
449 "movi v23.8b, #255 \n" in NV12ToARGBRow_NEON()
453 "subs %w3, %w3, #8 \n" in NV12ToARGBRow_NEON()
455 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" in NV12ToARGBRow_NEON()
477 "movi v23.8b, #255 \n" in NV21ToARGBRow_NEON()
481 "subs %w3, %w3, #8 \n" in NV21ToARGBRow_NEON()
483 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" in NV21ToARGBRow_NEON()
508 "subs %w3, %w3, #8 \n" in NV12ToRGB565Row_NEON()
511 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. in NV12ToRGB565Row_NEON()
532 "movi v23.8b, #255 \n" in YUY2ToARGBRow_NEON()
536 "subs %w2, %w2, #8 \n" in YUY2ToARGBRow_NEON()
538 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" in YUY2ToARGBRow_NEON()
558 "movi v23.8b, #255 \n" in UYVYToARGBRow_NEON()
562 "subs %w2, %w2, #8 \n" in UYVYToARGBRow_NEON()
564 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" in UYVYToARGBRow_NEON()
627 // Copy a multiple of 32 bytes. vld4.8 allows unaligned access and is fastest on A15.
632 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 in CopyRow_NEON()
635 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 in CopyRow_NEON()
645 // SetRow writes 'count' bytes using an 8 bit value repeated.
687 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 in MirrorRow_NEON()
689 "st1 {v0.D}[0], [%1], #8 \n" in MirrorRow_NEON()
709 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 in MirrorUVRow_NEON()
710 "subs %w3, %w3, #8 \n" // 8 pixels per loop. in MirrorUVRow_NEON()
711 "rev64 v0.8b, v0.8b \n" in MirrorUVRow_NEON()
712 "rev64 v1.8b, v1.8b \n" in MirrorUVRow_NEON()
714 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 in MirrorUVRow_NEON()
716 "st1 {v1.8b}, [%2], #8 \n" in MirrorUVRow_NEON()
738 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 in ARGBMirrorRow_NEON()
740 "st1 {v0.D}[0], [%1], #8 \n" in ARGBMirrorRow_NEON()
752 "movi v4.8b, #255 \n" // Alpha in RGB24ToARGBRow_NEON()
755 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. in RGB24ToARGBRow_NEON()
756 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RGB24ToARGBRow_NEON()
758 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels in RGB24ToARGBRow_NEON()
770 "movi v5.8b, #255 \n" // Alpha in RAWToARGBRow_NEON()
773 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b in RAWToARGBRow_NEON()
774 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RAWToARGBRow_NEON()
775 "orr v3.8b, v1.8b, v1.8b \n" // move g in RAWToARGBRow_NEON()
776 "orr v4.8b, v0.8b, v0.8b \n" // move r in RAWToARGBRow_NEON()
778 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a in RAWToARGBRow_NEON()
792 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b in RAWToRGB24Row_NEON()
793 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RAWToRGB24Row_NEON()
794 "orr v3.8b, v1.8b, v1.8b \n" // move g in RAWToRGB24Row_NEON()
795 "orr v4.8b, v0.8b, v0.8b \n" // move r in RAWToRGB24Row_NEON()
797 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r in RAWToRGB24Row_NEON()
808 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
809 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
810 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
811 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
812 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
813 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
814 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
822 "movi v3.8b, #255 \n" // Alpha in RGB565ToARGBRow_NEON()
825 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. in RGB565ToARGBRow_NEON()
826 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RGB565ToARGBRow_NEON()
829 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels in RGB565ToARGBRow_NEON()
840 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
841 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
842 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
844 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
845 "xtn2 v3.16b, v2.8h \n" \
847 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
848 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
861 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
862 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
863 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
865 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
866 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
880 "movi v3.8b, #255 \n" // Alpha in ARGB1555ToARGBRow_NEON()
883 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. in ARGB1555ToARGBRow_NEON()
884 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGB1555ToARGBRow_NEON()
887 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels in ARGB1555ToARGBRow_NEON()
898 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
899 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
915 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. in ARGB4444ToARGBRow_NEON()
916 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGB4444ToARGBRow_NEON()
919 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels in ARGB4444ToARGBRow_NEON()
933 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels in ARGBToRGB24Row_NEON()
934 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBToRGB24Row_NEON()
936 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. in ARGBToRGB24Row_NEON()
950 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a in ARGBToRAWRow_NEON()
951 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBToRAWRow_NEON()
952 "orr v4.8b, v2.8b, v2.8b \n" // mov g in ARGBToRAWRow_NEON()
953 "orr v5.8b, v1.8b, v1.8b \n" // mov b in ARGBToRAWRow_NEON()
955 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b in ARGBToRAWRow_NEON()
1006 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels in YUY2ToUV422Row_NEON()
1007 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. in YUY2ToUV422Row_NEON()
1009 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. in YUY2ToUV422Row_NEON()
1011 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. in YUY2ToUV422Row_NEON()
1029 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels in UYVYToUV422Row_NEON()
1030 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. in UYVYToUV422Row_NEON()
1032 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. in UYVYToUV422Row_NEON()
1034 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. in UYVYToUV422Row_NEON()
1054 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels in YUY2ToUVRow_NEON()
1055 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. in YUY2ToUVRow_NEON()
1057 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row in YUY2ToUVRow_NEON()
1058 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U in YUY2ToUVRow_NEON()
1059 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V in YUY2ToUVRow_NEON()
1061 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. in YUY2ToUVRow_NEON()
1063 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. in YUY2ToUVRow_NEON()
1085 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels in UYVYToUVRow_NEON()
1086 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. in UYVYToUVRow_NEON()
1088 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row in UYVYToUVRow_NEON()
1089 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U in UYVYToUVRow_NEON()
1090 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V in UYVYToUVRow_NEON()
1092 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. in UYVYToUVRow_NEON()
1094 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. in UYVYToUVRow_NEON()
1139 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys in I422ToYUY2Row_NEON()
1140 "orr v2.8b, v1.8b, v1.8b \n" in I422ToYUY2Row_NEON()
1142 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us in I422ToYUY2Row_NEON()
1144 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs in I422ToYUY2Row_NEON()
1147 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. in I422ToYUY2Row_NEON()
1167 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys in I422ToUYVYRow_NEON()
1168 "orr v3.8b, v2.8b, v2.8b \n" in I422ToUYVYRow_NEON()
1170 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us in I422ToUYVYRow_NEON()
1172 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs in I422ToUYVYRow_NEON()
1175 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. in I422ToUYVYRow_NEON()
1191 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels in ARGBToRGB565Row_NEON()
1192 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBToRGB565Row_NEON()
1195 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. in ARGBToRGB565Row_NEON()
1213 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels in ARGBToRGB565DitherRow_NEON()
1214 "subs %w3, %w3, #8 \n" // 8 processed per loop. in ARGBToRGB565DitherRow_NEON()
1215 "uqadd v20.8b, v20.8b, v1.8b \n" in ARGBToRGB565DitherRow_NEON()
1216 "uqadd v21.8b, v21.8b, v1.8b \n" in ARGBToRGB565DitherRow_NEON()
1217 "uqadd v22.8b, v22.8b, v1.8b \n" in ARGBToRGB565DitherRow_NEON()
1220 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. in ARGBToRGB565DitherRow_NEON()
1236 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels in ARGBToARGB1555Row_NEON()
1237 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBToARGB1555Row_NEON()
1240 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. in ARGBToARGB1555Row_NEON()
1257 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels in ARGBToARGB4444Row_NEON()
1258 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBToARGB4444Row_NEON()
1261 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. in ARGBToARGB4444Row_NEON()
1273 "movi v4.8b, #13 \n" // B * 0.1016 coefficient in ARGBToYRow_NEON()
1274 "movi v5.8b, #65 \n" // G * 0.5078 coefficient in ARGBToYRow_NEON()
1275 "movi v6.8b, #33 \n" // R * 0.2578 coefficient in ARGBToYRow_NEON()
1276 "movi v7.8b, #16 \n" // Add 16 constant in ARGBToYRow_NEON()
1279 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBToYRow_NEON()
1280 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBToYRow_NEON()
1281 "umull v3.8h, v0.8b, v4.8b \n" // B in ARGBToYRow_NEON()
1282 "umlal v3.8h, v1.8b, v5.8b \n" // G in ARGBToYRow_NEON()
1283 "umlal v3.8h, v2.8b, v6.8b \n" // R in ARGBToYRow_NEON()
1284 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y in ARGBToYRow_NEON()
1285 "uqadd v0.8b, v0.8b, v7.8b \n" in ARGBToYRow_NEON()
1287 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in ARGBToYRow_NEON()
1316 "movi v4.8b, #15 \n" // B * 0.11400 coefficient in ARGBToYJRow_NEON()
1317 "movi v5.8b, #75 \n" // G * 0.58700 coefficient in ARGBToYJRow_NEON()
1318 "movi v6.8b, #38 \n" // R * 0.29900 coefficient in ARGBToYJRow_NEON()
1321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBToYJRow_NEON()
1322 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBToYJRow_NEON()
1323 "umull v3.8h, v0.8b, v4.8b \n" // B in ARGBToYJRow_NEON()
1324 "umlal v3.8h, v1.8b, v5.8b \n" // G in ARGBToYJRow_NEON()
1325 "umlal v3.8h, v2.8b, v6.8b \n" // R in ARGBToYJRow_NEON()
1326 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y in ARGBToYJRow_NEON()
1328 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in ARGBToYJRow_NEON()
1338 // 8x1 pixels.
1344 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient in ARGBToUV444Row_NEON()
1345 "movi v25.8b, #74 \n" // UG -0.5781 coefficient in ARGBToUV444Row_NEON()
1346 "movi v26.8b, #38 \n" // UR -0.2969 coefficient in ARGBToUV444Row_NEON()
1347 "movi v27.8b, #18 \n" // VB -0.1406 coefficient in ARGBToUV444Row_NEON()
1348 "movi v28.8b, #94 \n" // VG -0.7344 coefficient in ARGBToUV444Row_NEON()
1352 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBToUV444Row_NEON()
1353 "subs %w3, %w3, #8 \n" // 8 processed per loop. in ARGBToUV444Row_NEON()
1354 "umull v4.8h, v0.8b, v24.8b \n" // B in ARGBToUV444Row_NEON()
1355 "umlsl v4.8h, v1.8b, v25.8b \n" // G in ARGBToUV444Row_NEON()
1356 "umlsl v4.8h, v2.8b, v26.8b \n" // R in ARGBToUV444Row_NEON()
1357 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned in ARGBToUV444Row_NEON()
1359 "umull v3.8h, v2.8b, v24.8b \n" // R in ARGBToUV444Row_NEON()
1360 "umlsl v3.8h, v1.8b, v28.8b \n" // G in ARGBToUV444Row_NEON()
1361 "umlsl v3.8h, v0.8b, v27.8b \n" // B in ARGBToUV444Row_NEON()
1362 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned in ARGBToUV444Row_NEON()
1364 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U in ARGBToUV444Row_NEON()
1365 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V in ARGBToUV444Row_NEON()
1368 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. in ARGBToUV444Row_NEON()
1370 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. in ARGBToUV444Row_NEON()
1383 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
1384 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
1385 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
1386 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
1387 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
1390 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1392 "mul v3.8h, " #QB \
1393 ",v20.8h \n" /* B */ \
1394 "mul v4.8h, " #QR \
1395 ",v20.8h \n" /* R */ \
1396 "mls v3.8h, " #QG \
1397 ",v21.8h \n" /* G */ \
1398 "mls v4.8h, " #QG \
1399 ",v24.8h \n" /* G */ \
1400 "mls v3.8h, " #QR \
1401 ",v22.8h \n" /* R */ \
1402 "mls v4.8h, " #QB \
1403 ",v23.8h \n" /* B */ \
1404 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1405 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1406 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1407 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1423 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. in ARGBToUVRow_NEON()
1424 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. in ARGBToUVRow_NEON()
1425 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. in ARGBToUVRow_NEON()
1429 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. in ARGBToUVRow_NEON()
1430 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. in ARGBToUVRow_NEON()
1431 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. in ARGBToUVRow_NEON()
1433 "urshr v0.8h, v0.8h, #1 \n" // 2x average in ARGBToUVRow_NEON()
1434 "urshr v1.8h, v1.8h, #1 \n" in ARGBToUVRow_NEON()
1435 "urshr v2.8h, v2.8h, #1 \n" in ARGBToUVRow_NEON()
1438 RGBTOUV(v0.8h, v1.8h, v2.8h) in ARGBToUVRow_NEON()
1440 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in ARGBToUVRow_NEON()
1442 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in ARGBToUVRow_NEON()
1463 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 in ARGBToUVJRow_NEON()
1464 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 in ARGBToUVJRow_NEON()
1465 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 in ARGBToUVJRow_NEON()
1466 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 in ARGBToUVJRow_NEON()
1467 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 in ARGBToUVJRow_NEON()
1472 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. in ARGBToUVJRow_NEON()
1473 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. in ARGBToUVJRow_NEON()
1474 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. in ARGBToUVJRow_NEON()
1477 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. in ARGBToUVJRow_NEON()
1478 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. in ARGBToUVJRow_NEON()
1479 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. in ARGBToUVJRow_NEON()
1481 "urshr v0.8h, v0.8h, #1 \n" // 2x average in ARGBToUVJRow_NEON()
1482 "urshr v1.8h, v1.8h, #1 \n" in ARGBToUVJRow_NEON()
1483 "urshr v2.8h, v2.8h, #1 \n" in ARGBToUVJRow_NEON()
1486 RGBTOUV(v0.8h, v1.8h, v2.8h) in ARGBToUVJRow_NEON()
1488 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in ARGBToUVJRow_NEON()
1490 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in ARGBToUVJRow_NEON()
1514 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. in BGRAToUVRow_NEON()
1515 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. in BGRAToUVRow_NEON()
1516 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. in BGRAToUVRow_NEON()
1519 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. in BGRAToUVRow_NEON()
1520 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. in BGRAToUVRow_NEON()
1521 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. in BGRAToUVRow_NEON()
1523 "urshr v0.8h, v0.8h, #1 \n" // 2x average in BGRAToUVRow_NEON()
1524 "urshr v1.8h, v3.8h, #1 \n" in BGRAToUVRow_NEON()
1525 "urshr v2.8h, v2.8h, #1 \n" in BGRAToUVRow_NEON()
1528 RGBTOUV(v0.8h, v1.8h, v2.8h) in BGRAToUVRow_NEON()
1530 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in BGRAToUVRow_NEON()
1532 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in BGRAToUVRow_NEON()
1556 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. in ABGRToUVRow_NEON()
1557 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. in ABGRToUVRow_NEON()
1558 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. in ABGRToUVRow_NEON()
1561 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. in ABGRToUVRow_NEON()
1562 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. in ABGRToUVRow_NEON()
1563 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. in ABGRToUVRow_NEON()
1565 "urshr v0.8h, v3.8h, #1 \n" // 2x average in ABGRToUVRow_NEON()
1566 "urshr v2.8h, v2.8h, #1 \n" in ABGRToUVRow_NEON()
1567 "urshr v1.8h, v1.8h, #1 \n" in ABGRToUVRow_NEON()
1570 RGBTOUV(v0.8h, v2.8h, v1.8h) in ABGRToUVRow_NEON()
1572 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in ABGRToUVRow_NEON()
1574 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in ABGRToUVRow_NEON()
1598 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. in RGBAToUVRow_NEON()
1599 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. in RGBAToUVRow_NEON()
1600 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. in RGBAToUVRow_NEON()
1603 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. in RGBAToUVRow_NEON()
1604 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. in RGBAToUVRow_NEON()
1605 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. in RGBAToUVRow_NEON()
1607 "urshr v0.8h, v0.8h, #1 \n" // 2x average in RGBAToUVRow_NEON()
1608 "urshr v1.8h, v1.8h, #1 \n" in RGBAToUVRow_NEON()
1609 "urshr v2.8h, v2.8h, #1 \n" in RGBAToUVRow_NEON()
1612 RGBTOUV(v0.8h, v1.8h, v2.8h) in RGBAToUVRow_NEON()
1614 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in RGBAToUVRow_NEON()
1616 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in RGBAToUVRow_NEON()
1640 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. in RGB24ToUVRow_NEON()
1641 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. in RGB24ToUVRow_NEON()
1642 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. in RGB24ToUVRow_NEON()
1645 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. in RGB24ToUVRow_NEON()
1646 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. in RGB24ToUVRow_NEON()
1647 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. in RGB24ToUVRow_NEON()
1649 "urshr v0.8h, v0.8h, #1 \n" // 2x average in RGB24ToUVRow_NEON()
1650 "urshr v1.8h, v1.8h, #1 \n" in RGB24ToUVRow_NEON()
1651 "urshr v2.8h, v2.8h, #1 \n" in RGB24ToUVRow_NEON()
1654 RGBTOUV(v0.8h, v1.8h, v2.8h) in RGB24ToUVRow_NEON()
1656 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in RGB24ToUVRow_NEON()
1658 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in RGB24ToUVRow_NEON()
1681 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. in RAWToUVRow_NEON()
1682 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. in RAWToUVRow_NEON()
1683 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. in RAWToUVRow_NEON()
1684 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. in RAWToUVRow_NEON()
1686 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more RAW pixels in RAWToUVRow_NEON()
1687 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. in RAWToUVRow_NEON()
1688 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. in RAWToUVRow_NEON()
1689 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. in RAWToUVRow_NEON()
1691 "urshr v2.8h, v2.8h, #1 \n" // 2x average in RAWToUVRow_NEON()
1692 "urshr v1.8h, v1.8h, #1 \n" in RAWToUVRow_NEON()
1693 "urshr v0.8h, v0.8h, #1 \n" in RAWToUVRow_NEON()
1696 RGBTOUV(v2.8h, v1.8h, v0.8h) in RAWToUVRow_NEON()
1698 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in RAWToUVRow_NEON()
1700 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in RAWToUVRow_NEON()
1713 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1721 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 in RGB565ToUVRow_NEON()
1722 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 in RGB565ToUVRow_NEON()
1723 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 in RGB565ToUVRow_NEON()
1724 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 in RGB565ToUVRow_NEON()
1725 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 in RGB565ToUVRow_NEON()
1729 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. in RGB565ToUVRow_NEON()
1731 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1732 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1733 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1735 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. in RGB565ToUVRow_NEON()
1737 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1738 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1739 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1742 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. in RGB565ToUVRow_NEON()
1744 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1745 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1746 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1748 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. in RGB565ToUVRow_NEON()
1750 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1751 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1752 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in RGB565ToUVRow_NEON()
1758 "urshr v4.8h, v16.8h, #1 \n" // 2x average in RGB565ToUVRow_NEON()
1759 "urshr v5.8h, v18.8h, #1 \n" in RGB565ToUVRow_NEON()
1760 "urshr v6.8h, v20.8h, #1 \n" in RGB565ToUVRow_NEON()
1763 "mul v16.8h, v4.8h, v22.8h \n" // B in RGB565ToUVRow_NEON()
1764 "mls v16.8h, v5.8h, v23.8h \n" // G in RGB565ToUVRow_NEON()
1765 "mls v16.8h, v6.8h, v24.8h \n" // R in RGB565ToUVRow_NEON()
1766 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned in RGB565ToUVRow_NEON()
1767 "mul v17.8h, v6.8h, v22.8h \n" // R in RGB565ToUVRow_NEON()
1768 "mls v17.8h, v5.8h, v26.8h \n" // G in RGB565ToUVRow_NEON()
1769 "mls v17.8h, v4.8h, v25.8h \n" // B in RGB565ToUVRow_NEON()
1770 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned in RGB565ToUVRow_NEON()
1771 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U in RGB565ToUVRow_NEON()
1772 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V in RGB565ToUVRow_NEON()
1774 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in RGB565ToUVRow_NEON()
1776 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in RGB565ToUVRow_NEON()
1790 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1801 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. in ARGB1555ToUVRow_NEON()
1803 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1804 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1805 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1807 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. in ARGB1555ToUVRow_NEON()
1809 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1810 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1811 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1814 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. in ARGB1555ToUVRow_NEON()
1816 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1817 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1818 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1820 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. in ARGB1555ToUVRow_NEON()
1822 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1823 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1824 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB1555ToUVRow_NEON()
1830 "urshr v4.8h, v16.8h, #1 \n" // 2x average in ARGB1555ToUVRow_NEON()
1831 "urshr v5.8h, v17.8h, #1 \n" in ARGB1555ToUVRow_NEON()
1832 "urshr v6.8h, v18.8h, #1 \n" in ARGB1555ToUVRow_NEON()
1835 "mul v2.8h, v4.8h, v20.8h \n" // B in ARGB1555ToUVRow_NEON()
1836 "mls v2.8h, v5.8h, v21.8h \n" // G in ARGB1555ToUVRow_NEON()
1837 "mls v2.8h, v6.8h, v22.8h \n" // R in ARGB1555ToUVRow_NEON()
1838 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned in ARGB1555ToUVRow_NEON()
1839 "mul v3.8h, v6.8h, v20.8h \n" // R in ARGB1555ToUVRow_NEON()
1840 "mls v3.8h, v5.8h, v24.8h \n" // G in ARGB1555ToUVRow_NEON()
1841 "mls v3.8h, v4.8h, v23.8h \n" // B in ARGB1555ToUVRow_NEON()
1842 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned in ARGB1555ToUVRow_NEON()
1843 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U in ARGB1555ToUVRow_NEON()
1844 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V in ARGB1555ToUVRow_NEON()
1846 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in ARGB1555ToUVRow_NEON()
1848 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in ARGB1555ToUVRow_NEON()
1862 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1873 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. in ARGB4444ToUVRow_NEON()
1875 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1876 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1877 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1879 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. in ARGB4444ToUVRow_NEON()
1881 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1882 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1883 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1886 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. in ARGB4444ToUVRow_NEON()
1888 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1889 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1890 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1892 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. in ARGB4444ToUVRow_NEON()
1894 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1895 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1896 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. in ARGB4444ToUVRow_NEON()
1902 "urshr v4.8h, v16.8h, #1 \n" // 2x average in ARGB4444ToUVRow_NEON()
1903 "urshr v5.8h, v17.8h, #1 \n" in ARGB4444ToUVRow_NEON()
1904 "urshr v6.8h, v18.8h, #1 \n" in ARGB4444ToUVRow_NEON()
1907 "mul v2.8h, v4.8h, v20.8h \n" // B in ARGB4444ToUVRow_NEON()
1908 "mls v2.8h, v5.8h, v21.8h \n" // G in ARGB4444ToUVRow_NEON()
1909 "mls v2.8h, v6.8h, v22.8h \n" // R in ARGB4444ToUVRow_NEON()
1910 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned in ARGB4444ToUVRow_NEON()
1911 "mul v3.8h, v6.8h, v20.8h \n" // R in ARGB4444ToUVRow_NEON()
1912 "mls v3.8h, v5.8h, v24.8h \n" // G in ARGB4444ToUVRow_NEON()
1913 "mls v3.8h, v4.8h, v23.8h \n" // B in ARGB4444ToUVRow_NEON()
1914 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned in ARGB4444ToUVRow_NEON()
1915 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U in ARGB4444ToUVRow_NEON()
1916 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V in ARGB4444ToUVRow_NEON()
1918 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. in ARGB4444ToUVRow_NEON()
1920 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. in ARGB4444ToUVRow_NEON()
1937 "movi v24.8b, #13 \n" // B * 0.1016 coefficient in RGB565ToYRow_NEON()
1938 "movi v25.8b, #65 \n" // G * 0.5078 coefficient in RGB565ToYRow_NEON()
1939 "movi v26.8b, #33 \n" // R * 0.2578 coefficient in RGB565ToYRow_NEON()
1940 "movi v27.8b, #16 \n" // Add 16 constant in RGB565ToYRow_NEON()
1943 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. in RGB565ToYRow_NEON()
1944 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RGB565ToYRow_NEON()
1946 "umull v3.8h, v0.8b, v24.8b \n" // B in RGB565ToYRow_NEON()
1947 "umlal v3.8h, v1.8b, v25.8b \n" // G in RGB565ToYRow_NEON()
1948 "umlal v3.8h, v2.8b, v26.8b \n" // R in RGB565ToYRow_NEON()
1949 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y in RGB565ToYRow_NEON()
1950 "uqadd v0.8b, v0.8b, v27.8b \n" in RGB565ToYRow_NEON()
1952 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in RGB565ToYRow_NEON()
1965 "movi v4.8b, #13 \n" // B * 0.1016 coefficient in ARGB1555ToYRow_NEON()
1966 "movi v5.8b, #65 \n" // G * 0.5078 coefficient in ARGB1555ToYRow_NEON()
1967 "movi v6.8b, #33 \n" // R * 0.2578 coefficient in ARGB1555ToYRow_NEON()
1968 "movi v7.8b, #16 \n" // Add 16 constant in ARGB1555ToYRow_NEON()
1971 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. in ARGB1555ToYRow_NEON()
1972 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGB1555ToYRow_NEON()
1974 "umull v3.8h, v0.8b, v4.8b \n" // B in ARGB1555ToYRow_NEON()
1975 "umlal v3.8h, v1.8b, v5.8b \n" // G in ARGB1555ToYRow_NEON()
1976 "umlal v3.8h, v2.8b, v6.8b \n" // R in ARGB1555ToYRow_NEON()
1977 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y in ARGB1555ToYRow_NEON()
1978 "uqadd v0.8b, v0.8b, v7.8b \n" in ARGB1555ToYRow_NEON()
1980 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in ARGB1555ToYRow_NEON()
1992 "movi v24.8b, #13 \n" // B * 0.1016 coefficient in ARGB4444ToYRow_NEON()
1993 "movi v25.8b, #65 \n" // G * 0.5078 coefficient in ARGB4444ToYRow_NEON()
1994 "movi v26.8b, #33 \n" // R * 0.2578 coefficient in ARGB4444ToYRow_NEON()
1995 "movi v27.8b, #16 \n" // Add 16 constant in ARGB4444ToYRow_NEON()
1998 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. in ARGB4444ToYRow_NEON()
1999 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGB4444ToYRow_NEON()
2001 "umull v3.8h, v0.8b, v24.8b \n" // B in ARGB4444ToYRow_NEON()
2002 "umlal v3.8h, v1.8b, v25.8b \n" // G in ARGB4444ToYRow_NEON()
2003 "umlal v3.8h, v2.8b, v26.8b \n" // R in ARGB4444ToYRow_NEON()
2004 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y in ARGB4444ToYRow_NEON()
2005 "uqadd v0.8b, v0.8b, v27.8b \n" in ARGB4444ToYRow_NEON()
2007 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in ARGB4444ToYRow_NEON()
2019 "movi v4.8b, #33 \n" // R * 0.2578 coefficient in BGRAToYRow_NEON()
2020 "movi v5.8b, #65 \n" // G * 0.5078 coefficient in BGRAToYRow_NEON()
2021 "movi v6.8b, #13 \n" // B * 0.1016 coefficient in BGRAToYRow_NEON()
2022 "movi v7.8b, #16 \n" // Add 16 constant in BGRAToYRow_NEON()
2025 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. in BGRAToYRow_NEON()
2026 "subs %w2, %w2, #8 \n" // 8 processed per loop. in BGRAToYRow_NEON()
2027 "umull v16.8h, v1.8b, v4.8b \n" // R in BGRAToYRow_NEON()
2028 "umlal v16.8h, v2.8b, v5.8b \n" // G in BGRAToYRow_NEON()
2029 "umlal v16.8h, v3.8b, v6.8b \n" // B in BGRAToYRow_NEON()
2030 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y in BGRAToYRow_NEON()
2031 "uqadd v0.8b, v0.8b, v7.8b \n" in BGRAToYRow_NEON()
2033 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in BGRAToYRow_NEON()
2045 "movi v4.8b, #33 \n" // R * 0.2578 coefficient in ABGRToYRow_NEON()
2046 "movi v5.8b, #65 \n" // G * 0.5078 coefficient in ABGRToYRow_NEON()
2047 "movi v6.8b, #13 \n" // B * 0.1016 coefficient in ABGRToYRow_NEON()
2048 "movi v7.8b, #16 \n" // Add 16 constant in ABGRToYRow_NEON()
2051 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. in ABGRToYRow_NEON()
2052 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ABGRToYRow_NEON()
2053 "umull v16.8h, v0.8b, v4.8b \n" // R in ABGRToYRow_NEON()
2054 "umlal v16.8h, v1.8b, v5.8b \n" // G in ABGRToYRow_NEON()
2055 "umlal v16.8h, v2.8b, v6.8b \n" // B in ABGRToYRow_NEON()
2056 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y in ABGRToYRow_NEON()
2057 "uqadd v0.8b, v0.8b, v7.8b \n" in ABGRToYRow_NEON()
2059 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in ABGRToYRow_NEON()
2071 "movi v4.8b, #13 \n" // B * 0.1016 coefficient in RGBAToYRow_NEON()
2072 "movi v5.8b, #65 \n" // G * 0.5078 coefficient in RGBAToYRow_NEON()
2073 "movi v6.8b, #33 \n" // R * 0.2578 coefficient in RGBAToYRow_NEON()
2074 "movi v7.8b, #16 \n" // Add 16 constant in RGBAToYRow_NEON()
2077 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. in RGBAToYRow_NEON()
2078 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RGBAToYRow_NEON()
2079 "umull v16.8h, v1.8b, v4.8b \n" // B in RGBAToYRow_NEON()
2080 "umlal v16.8h, v2.8b, v5.8b \n" // G in RGBAToYRow_NEON()
2081 "umlal v16.8h, v3.8b, v6.8b \n" // R in RGBAToYRow_NEON()
2082 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y in RGBAToYRow_NEON()
2083 "uqadd v0.8b, v0.8b, v7.8b \n" in RGBAToYRow_NEON()
2085 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in RGBAToYRow_NEON()
2097 "movi v4.8b, #13 \n" // B * 0.1016 coefficient in RGB24ToYRow_NEON()
2098 "movi v5.8b, #65 \n" // G * 0.5078 coefficient in RGB24ToYRow_NEON()
2099 "movi v6.8b, #33 \n" // R * 0.2578 coefficient in RGB24ToYRow_NEON()
2100 "movi v7.8b, #16 \n" // Add 16 constant in RGB24ToYRow_NEON()
2103 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. in RGB24ToYRow_NEON()
2104 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RGB24ToYRow_NEON()
2105 "umull v16.8h, v0.8b, v4.8b \n" // B in RGB24ToYRow_NEON()
2106 "umlal v16.8h, v1.8b, v5.8b \n" // G in RGB24ToYRow_NEON()
2107 "umlal v16.8h, v2.8b, v6.8b \n" // R in RGB24ToYRow_NEON()
2108 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y in RGB24ToYRow_NEON()
2109 "uqadd v0.8b, v0.8b, v7.8b \n" in RGB24ToYRow_NEON()
2111 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in RGB24ToYRow_NEON()
2123 "movi v4.8b, #33 \n" // R * 0.2578 coefficient in RAWToYRow_NEON()
2124 "movi v5.8b, #65 \n" // G * 0.5078 coefficient in RAWToYRow_NEON()
2125 "movi v6.8b, #13 \n" // B * 0.1016 coefficient in RAWToYRow_NEON()
2126 "movi v7.8b, #16 \n" // Add 16 constant in RAWToYRow_NEON()
2129 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. in RAWToYRow_NEON()
2130 "subs %w2, %w2, #8 \n" // 8 processed per loop. in RAWToYRow_NEON()
2131 "umull v16.8h, v0.8b, v4.8b \n" // R in RAWToYRow_NEON()
2132 "umlal v16.8h, v1.8b, v5.8b \n" // G in RAWToYRow_NEON()
2133 "umlal v16.8h, v2.8b, v6.8b \n" // B in RAWToYRow_NEON()
2134 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y in RAWToYRow_NEON()
2135 "uqadd v0.8b, v0.8b, v7.8b \n" in RAWToYRow_NEON()
2137 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. in RAWToYRow_NEON()
2171 "umull v2.8h, v0.8b, v4.8b \n" in InterpolateRow_NEON()
2172 "umull2 v3.8h, v0.16b, v4.16b \n" in InterpolateRow_NEON()
2173 "umlal v2.8h, v1.8b, v5.8b \n" in InterpolateRow_NEON()
2174 "umlal2 v3.8h, v1.16b, v5.16b \n" in InterpolateRow_NEON()
2175 "rshrn v0.8b, v2.8h, #8 \n" in InterpolateRow_NEON()
2176 "rshrn2 v0.16b, v3.8h, #8 \n" in InterpolateRow_NEON()
2222 "subs %w3, %w3, #8 \n" in ARGBBlendRow_NEON()
2224 // Blend 8 pixels. in ARGBBlendRow_NEON()
2225 "8: \n" in ARGBBlendRow_NEON()
2227 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels in ARGBBlendRow_NEON()
2229 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels in ARGBBlendRow_NEON()
2230 "subs %w3, %w3, #8 \n" // 8 processed per loop. in ARGBBlendRow_NEON()
2231 "umull v16.8h, v4.8b, v3.8b \n" // db * a in ARGBBlendRow_NEON()
2232 "umull v17.8h, v5.8b, v3.8b \n" // dg * a in ARGBBlendRow_NEON()
2233 "umull v18.8h, v6.8b, v3.8b \n" // dr * a in ARGBBlendRow_NEON()
2234 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 in ARGBBlendRow_NEON()
2235 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 in ARGBBlendRow_NEON()
2236 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 in ARGBBlendRow_NEON()
2237 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) in ARGBBlendRow_NEON()
2238 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) in ARGBBlendRow_NEON()
2239 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) in ARGBBlendRow_NEON()
2240 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb in ARGBBlendRow_NEON()
2241 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg in ARGBBlendRow_NEON()
2242 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr in ARGBBlendRow_NEON()
2243 "movi v3.8b, #255 \n" // a = 255 in ARGBBlendRow_NEON()
2245 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels in ARGBBlendRow_NEON()
2246 "b.ge 8b \n" in ARGBBlendRow_NEON()
2249 "adds %w3, %w3, #8-1 \n" in ARGBBlendRow_NEON()
2259 "umull v16.8h, v4.8b, v3.8b \n" // db * a in ARGBBlendRow_NEON()
2260 "umull v17.8h, v5.8b, v3.8b \n" // dg * a in ARGBBlendRow_NEON()
2261 "umull v18.8h, v6.8b, v3.8b \n" // dr * a in ARGBBlendRow_NEON()
2262 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 in ARGBBlendRow_NEON()
2263 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 in ARGBBlendRow_NEON()
2264 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 in ARGBBlendRow_NEON()
2265 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) in ARGBBlendRow_NEON()
2266 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) in ARGBBlendRow_NEON()
2267 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) in ARGBBlendRow_NEON()
2268 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb in ARGBBlendRow_NEON()
2269 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg in ARGBBlendRow_NEON()
2270 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr in ARGBBlendRow_NEON()
2271 "movi v3.8b, #255 \n" // a = 255 in ARGBBlendRow_NEON()
2288 // Attenuate 8 pixels at a time.
2291 // Attenuate 8 pixels. in ARGBAttenuateRow_NEON()
2294 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels in ARGBAttenuateRow_NEON()
2295 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBAttenuateRow_NEON()
2296 "umull v4.8h, v0.8b, v3.8b \n" // b * a in ARGBAttenuateRow_NEON()
2297 "umull v5.8h, v1.8b, v3.8b \n" // g * a in ARGBAttenuateRow_NEON()
2298 "umull v6.8h, v2.8b, v3.8b \n" // r * a in ARGBAttenuateRow_NEON()
2299 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 in ARGBAttenuateRow_NEON()
2300 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 in ARGBAttenuateRow_NEON()
2301 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 in ARGBAttenuateRow_NEON()
2303 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels in ARGBAttenuateRow_NEON()
2313 // Quantize 8 ARGB pixels (32 bytes).
2321 "dup v4.8h, %w2 \n" in ARGBQuantizeRow_NEON()
2322 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 in ARGBQuantizeRow_NEON()
2323 "dup v5.8h, %w3 \n" // interval multiply. in ARGBQuantizeRow_NEON()
2324 "dup v6.8h, %w4 \n" // interval add in ARGBQuantizeRow_NEON()
2326 // 8 pixel loop. in ARGBQuantizeRow_NEON()
2329 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. in ARGBQuantizeRow_NEON()
2330 "subs %w1, %w1, #8 \n" // 8 processed per loop. in ARGBQuantizeRow_NEON()
2331 "uxtl v0.8h, v0.8b \n" // b (0 .. 255) in ARGBQuantizeRow_NEON()
2332 "uxtl v1.8h, v1.8b \n" in ARGBQuantizeRow_NEON()
2333 "uxtl v2.8h, v2.8b \n" in ARGBQuantizeRow_NEON()
2334 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale in ARGBQuantizeRow_NEON()
2335 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g in ARGBQuantizeRow_NEON()
2336 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r in ARGBQuantizeRow_NEON()
2337 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size in ARGBQuantizeRow_NEON()
2338 "mul v1.8h, v1.8h, v5.8h \n" // g in ARGBQuantizeRow_NEON()
2339 "mul v2.8h, v2.8h, v5.8h \n" // r in ARGBQuantizeRow_NEON()
2340 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset in ARGBQuantizeRow_NEON()
2341 "add v1.8h, v1.8h, v6.8h \n" // g in ARGBQuantizeRow_NEON()
2342 "add v2.8h, v2.8h, v6.8h \n" // r in ARGBQuantizeRow_NEON()
2343 "uqxtn v0.8b, v0.8h \n" in ARGBQuantizeRow_NEON()
2344 "uqxtn v1.8b, v1.8h \n" in ARGBQuantizeRow_NEON()
2345 "uqxtn v2.8b, v2.8h \n" in ARGBQuantizeRow_NEON()
2347 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels in ARGBQuantizeRow_NEON()
2358 // Shade 8 pixels at a time by specified value.
2359 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2367 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. in ARGBShadeRow_NEON()
2368 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. in ARGBShadeRow_NEON()
2370 // 8 pixel loop. in ARGBShadeRow_NEON()
2373 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBShadeRow_NEON()
2374 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBShadeRow_NEON()
2375 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) in ARGBShadeRow_NEON()
2376 "uxtl v5.8h, v5.8b \n" in ARGBShadeRow_NEON()
2377 "uxtl v6.8h, v6.8b \n" in ARGBShadeRow_NEON()
2378 "uxtl v7.8h, v7.8b \n" in ARGBShadeRow_NEON()
2379 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 in ARGBShadeRow_NEON()
2380 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g in ARGBShadeRow_NEON()
2381 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r in ARGBShadeRow_NEON()
2382 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a in ARGBShadeRow_NEON()
2383 "uqxtn v4.8b, v4.8h \n" in ARGBShadeRow_NEON()
2384 "uqxtn v5.8b, v5.8h \n" in ARGBShadeRow_NEON()
2385 "uqxtn v6.8b, v6.8h \n" in ARGBShadeRow_NEON()
2386 "uqxtn v7.8b, v7.8h \n" in ARGBShadeRow_NEON()
2388 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels in ARGBShadeRow_NEON()
2398 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2403 "movi v24.8b, #15 \n" // B * 0.11400 coefficient in ARGBGrayRow_NEON()
2404 "movi v25.8b, #75 \n" // G * 0.58700 coefficient in ARGBGrayRow_NEON()
2405 "movi v26.8b, #38 \n" // R * 0.29900 coefficient in ARGBGrayRow_NEON()
2408 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBGrayRow_NEON()
2409 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBGrayRow_NEON()
2410 "umull v4.8h, v0.8b, v24.8b \n" // B in ARGBGrayRow_NEON()
2411 "umlal v4.8h, v1.8b, v25.8b \n" // G in ARGBGrayRow_NEON()
2412 "umlal v4.8h, v2.8b, v26.8b \n" // R in ARGBGrayRow_NEON()
2413 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B in ARGBGrayRow_NEON()
2414 "orr v1.8b, v0.8b, v0.8b \n" // G in ARGBGrayRow_NEON()
2415 "orr v2.8b, v0.8b, v0.8b \n" // R in ARGBGrayRow_NEON()
2417 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. in ARGBGrayRow_NEON()
2427 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2434 "movi v20.8b, #17 \n" // BB coefficient in ARGBSepiaRow_NEON()
2435 "movi v21.8b, #68 \n" // BG coefficient in ARGBSepiaRow_NEON()
2436 "movi v22.8b, #35 \n" // BR coefficient in ARGBSepiaRow_NEON()
2437 "movi v24.8b, #22 \n" // GB coefficient in ARGBSepiaRow_NEON()
2438 "movi v25.8b, #88 \n" // GG coefficient in ARGBSepiaRow_NEON()
2439 "movi v26.8b, #45 \n" // GR coefficient in ARGBSepiaRow_NEON()
2440 "movi v28.8b, #24 \n" // RB coefficient in ARGBSepiaRow_NEON()
2441 "movi v29.8b, #98 \n" // RG coefficient in ARGBSepiaRow_NEON()
2442 "movi v30.8b, #50 \n" // RR coefficient in ARGBSepiaRow_NEON()
2445 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. in ARGBSepiaRow_NEON()
2446 "subs %w1, %w1, #8 \n" // 8 processed per loop. in ARGBSepiaRow_NEON()
2447 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B in ARGBSepiaRow_NEON()
2448 "umlal v4.8h, v1.8b, v21.8b \n" // G in ARGBSepiaRow_NEON()
2449 "umlal v4.8h, v2.8b, v22.8b \n" // R in ARGBSepiaRow_NEON()
2450 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G in ARGBSepiaRow_NEON()
2451 "umlal v5.8h, v1.8b, v25.8b \n" // G in ARGBSepiaRow_NEON()
2452 "umlal v5.8h, v2.8b, v26.8b \n" // R in ARGBSepiaRow_NEON()
2453 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R in ARGBSepiaRow_NEON()
2454 "umlal v6.8h, v1.8b, v29.8b \n" // G in ARGBSepiaRow_NEON()
2455 "umlal v6.8h, v2.8b, v30.8b \n" // R in ARGBSepiaRow_NEON()
2456 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B in ARGBSepiaRow_NEON()
2457 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G in ARGBSepiaRow_NEON()
2458 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R in ARGBSepiaRow_NEON()
2460 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. in ARGBSepiaRow_NEON()
2470 // Transform 8 ARGB pixels (32 bytes) with color matrix.
2480 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. in ARGBColorMatrixRow_NEON()
2481 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. in ARGBColorMatrixRow_NEON()
2485 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. in ARGBColorMatrixRow_NEON()
2486 "subs %w2, %w2, #8 \n" // 8 processed per loop. in ARGBColorMatrixRow_NEON()
2487 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit in ARGBColorMatrixRow_NEON()
2488 "uxtl v17.8h, v17.8b \n" // g in ARGBColorMatrixRow_NEON()
2489 "uxtl v18.8h, v18.8b \n" // r in ARGBColorMatrixRow_NEON()
2490 "uxtl v19.8h, v19.8b \n" // a in ARGBColorMatrixRow_NEON()
2491 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B in ARGBColorMatrixRow_NEON()
2492 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G in ARGBColorMatrixRow_NEON()
2493 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R in ARGBColorMatrixRow_NEON()
2494 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A in ARGBColorMatrixRow_NEON()
2495 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B in ARGBColorMatrixRow_NEON()
2496 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G in ARGBColorMatrixRow_NEON()
2497 "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R in ARGBColorMatrixRow_NEON()
2498 "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A in ARGBColorMatrixRow_NEON()
2499 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B in ARGBColorMatrixRow_NEON()
2500 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G in ARGBColorMatrixRow_NEON()
2501 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R in ARGBColorMatrixRow_NEON()
2502 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A in ARGBColorMatrixRow_NEON()
2503 "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B in ARGBColorMatrixRow_NEON()
2504 "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G in ARGBColorMatrixRow_NEON()
2505 "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R in ARGBColorMatrixRow_NEON()
2506 "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A in ARGBColorMatrixRow_NEON()
2507 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B in ARGBColorMatrixRow_NEON()
2508 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G in ARGBColorMatrixRow_NEON()
2509 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R in ARGBColorMatrixRow_NEON()
2510 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A in ARGBColorMatrixRow_NEON()
2511 "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B in ARGBColorMatrixRow_NEON()
2512 "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G in ARGBColorMatrixRow_NEON()
2513 "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R in ARGBColorMatrixRow_NEON()
2514 "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A in ARGBColorMatrixRow_NEON()
2515 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B in ARGBColorMatrixRow_NEON()
2516 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G in ARGBColorMatrixRow_NEON()
2517 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R in ARGBColorMatrixRow_NEON()
2518 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A in ARGBColorMatrixRow_NEON()
2519 "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B in ARGBColorMatrixRow_NEON()
2520 "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G in ARGBColorMatrixRow_NEON()
2521 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R in ARGBColorMatrixRow_NEON()
2522 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A in ARGBColorMatrixRow_NEON()
2524 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. in ARGBColorMatrixRow_NEON()
2536 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
2542 // 8 pixel loop. in ARGBMultiplyRow_NEON()
2545 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBMultiplyRow_NEON()
2547 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. in ARGBMultiplyRow_NEON()
2548 "subs %w3, %w3, #8 \n" // 8 processed per loop. in ARGBMultiplyRow_NEON()
2549 "umull v0.8h, v0.8b, v4.8b \n" // multiply B in ARGBMultiplyRow_NEON()
2550 "umull v1.8h, v1.8b, v5.8b \n" // multiply G in ARGBMultiplyRow_NEON()
2551 "umull v2.8h, v2.8b, v6.8b \n" // multiply R in ARGBMultiplyRow_NEON()
2552 "umull v3.8h, v3.8b, v7.8b \n" // multiply A in ARGBMultiplyRow_NEON()
2553 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B in ARGBMultiplyRow_NEON()
2554 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G in ARGBMultiplyRow_NEON()
2555 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R in ARGBMultiplyRow_NEON()
2556 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A in ARGBMultiplyRow_NEON()
2558 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels in ARGBMultiplyRow_NEON()
2570 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
2576 // 8 pixel loop. in ARGBAddRow_NEON()
2579 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBAddRow_NEON()
2581 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. in ARGBAddRow_NEON()
2582 "subs %w3, %w3, #8 \n" // 8 processed per loop. in ARGBAddRow_NEON()
2583 "uqadd v0.8b, v0.8b, v4.8b \n" in ARGBAddRow_NEON()
2584 "uqadd v1.8b, v1.8b, v5.8b \n" in ARGBAddRow_NEON()
2585 "uqadd v2.8b, v2.8b, v6.8b \n" in ARGBAddRow_NEON()
2586 "uqadd v3.8b, v3.8b, v7.8b \n" in ARGBAddRow_NEON()
2588 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels in ARGBAddRow_NEON()
2600 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
2606 // 8 pixel loop. in ARGBSubtractRow_NEON()
2609 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. in ARGBSubtractRow_NEON()
2611 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. in ARGBSubtractRow_NEON()
2612 "subs %w3, %w3, #8 \n" // 8 processed per loop. in ARGBSubtractRow_NEON()
2613 "uqsub v0.8b, v0.8b, v4.8b \n" in ARGBSubtractRow_NEON()
2614 "uqsub v1.8b, v1.8b, v5.8b \n" in ARGBSubtractRow_NEON()
2615 "uqsub v2.8b, v2.8b, v6.8b \n" in ARGBSubtractRow_NEON()
2616 "uqsub v3.8b, v3.8b, v7.8b \n" in ARGBSubtractRow_NEON()
2618 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels in ARGBSubtractRow_NEON()
2640 "movi v3.8b, #255 \n" // alpha in SobelRow_NEON()
2641 // 8 pixel loop. in SobelRow_NEON()
2644 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. in SobelRow_NEON()
2646 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. in SobelRow_NEON()
2647 "subs %w3, %w3, #8 \n" // 8 processed per loop. in SobelRow_NEON()
2648 "uqadd v0.8b, v0.8b, v1.8b \n" // add in SobelRow_NEON()
2649 "orr v1.8b, v0.8b, v0.8b \n" in SobelRow_NEON()
2650 "orr v2.8b, v0.8b, v0.8b \n" in SobelRow_NEON()
2652 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels in SobelRow_NEON()
2699 "movi v3.8b, #255 \n" // alpha in SobelXYRow_NEON()
2700 // 8 pixel loop. in SobelXYRow_NEON()
2703 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. in SobelXYRow_NEON()
2705 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. in SobelXYRow_NEON()
2706 "subs %w3, %w3, #8 \n" // 8 processed per loop. in SobelXYRow_NEON()
2707 "uqadd v1.8b, v0.8b, v2.8b \n" // add in SobelXYRow_NEON()
2709 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels in SobelXYRow_NEON()
2732 "ld1 {v0.8b}, [%0],%5 \n" // top in SobelXRow_NEON()
2734 "ld1 {v1.8b}, [%0],%6 \n" in SobelXRow_NEON()
2735 "usubl v0.8h, v0.8b, v1.8b \n" in SobelXRow_NEON()
2737 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 in SobelXRow_NEON()
2739 "ld1 {v3.8b}, [%1],%6 \n" in SobelXRow_NEON()
2740 "usubl v1.8h, v2.8b, v3.8b \n" in SobelXRow_NEON()
2741 "add v0.8h, v0.8h, v1.8h \n" in SobelXRow_NEON()
2742 "add v0.8h, v0.8h, v1.8h \n" in SobelXRow_NEON()
2744 "ld1 {v2.8b}, [%2],%5 \n" // bottom in SobelXRow_NEON()
2746 "ld1 {v3.8b}, [%2],%6 \n" in SobelXRow_NEON()
2747 "subs %w4, %w4, #8 \n" // 8 pixels in SobelXRow_NEON()
2748 "usubl v1.8h, v2.8b, v3.8b \n" in SobelXRow_NEON()
2749 "add v0.8h, v0.8h, v1.8h \n" in SobelXRow_NEON()
2750 "abs v0.8h, v0.8h \n" in SobelXRow_NEON()
2751 "uqxtn v0.8b, v0.8h \n" in SobelXRow_NEON()
2753 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx in SobelXRow_NEON()
2777 "ld1 {v0.8b}, [%0],%4 \n" // left in SobelYRow_NEON()
2779 "ld1 {v1.8b}, [%1],%4 \n" in SobelYRow_NEON()
2780 "usubl v0.8h, v0.8b, v1.8b \n" in SobelYRow_NEON()
2782 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 in SobelYRow_NEON()
2784 "ld1 {v3.8b}, [%1],%4 \n" in SobelYRow_NEON()
2785 "usubl v1.8h, v2.8b, v3.8b \n" in SobelYRow_NEON()
2786 "add v0.8h, v0.8h, v1.8h \n" in SobelYRow_NEON()
2787 "add v0.8h, v0.8h, v1.8h \n" in SobelYRow_NEON()
2789 "ld1 {v2.8b}, [%0],%5 \n" // right in SobelYRow_NEON()
2791 "ld1 {v3.8b}, [%1],%5 \n" in SobelYRow_NEON()
2792 "subs %w3, %w3, #8 \n" // 8 pixels in SobelYRow_NEON()
2793 "usubl v1.8h, v2.8b, v3.8b \n" in SobelYRow_NEON()
2794 "add v0.8h, v0.8h, v1.8h \n" in SobelYRow_NEON()
2795 "abs v0.8h, v0.8h \n" in SobelYRow_NEON()
2796 "uqxtn v0.8b, v0.8h \n" in SobelYRow_NEON()
2798 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely in SobelYRow_NEON()
2815 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts in HalfFloat1Row_NEON()
2816 "subs %w2, %w2, #8 \n" // 8 pixels per loop in HalfFloat1Row_NEON()
2817 "uxtl v2.4s, v1.4h \n" // 8 int's in HalfFloat1Row_NEON()
2818 "uxtl2 v3.4s, v1.8h \n" in HalfFloat1Row_NEON()
2819 "scvtf v2.4s, v2.4s \n" // 8 floats in HalfFloat1Row_NEON()
2821 "fcvtn v1.4h, v2.4s \n" // 8 half floats in HalfFloat1Row_NEON()
2822 "fcvtn2 v1.8h, v3.4s \n" in HalfFloat1Row_NEON()
2824 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts in HalfFloat1Row_NEON()
2838 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts in HalfFloatRow_NEON()
2839 "subs %w2, %w2, #8 \n" // 8 pixels per loop in HalfFloatRow_NEON()
2840 "uxtl v2.4s, v1.4h \n" // 8 int's in HalfFloatRow_NEON()
2841 "uxtl2 v3.4s, v1.8h \n" in HalfFloatRow_NEON()
2842 "scvtf v2.4s, v2.4s \n" // 8 floats in HalfFloatRow_NEON()
2847 "uqshrn2 v1.8h, v3.4s, #13 \n" in HalfFloatRow_NEON()
2849 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts in HalfFloatRow_NEON()