/*
 * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_pix_abs16_neon, export=1
        // Sum of absolute differences over h rows of 16 bytes:
        //   sum |pix1[i] - pix2[i]|, i = 0..15, per row.
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        //
        // Result returned in w0.  Only AAPCS64-volatile registers
        // (v0-v7, v16-v18) are used, so nothing needs to be spilled.
        cmp             w4, #4                      // if h < 4, jump to completion section
        movi            v18.4S, #0                  // clear result accumulator (d18 = running total)
        b.lt            2f
1:
        // Main loop: 4 rows per iteration.  Loads are interleaved with the
        // widening abs-diff ops to hide load latency.  Per-lane 16-bit sums in
        // v16/v17 cannot overflow: at most 4 * 255 = 1020 per lane.
        ld1             {v0.16b}, [x1], x3          // load pix1
        ld1             {v4.16b}, [x2], x3          // load pix2
        ld1             {v1.16b}, [x1], x3          // load pix1
        ld1             {v5.16b}, [x2], x3          // load pix2
        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
        uabdl2          v17.8h, v0.16b, v4.16b
        ld1             {v2.16b}, [x1], x3          // load pix1
        ld1             {v6.16b}, [x2], x3          // load pix2
        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
        uabal2          v17.8h, v1.16b, v5.16b
        ld1             {v3.16b}, [x1], x3
        ld1             {v7.16b}, [x2], x3
        uabal           v16.8h, v2.8b, v6.8b
        uabal2          v17.8h, v2.16b, v6.16b
        sub             w4, w4, #4                  // h -= 4
        uabal           v16.8h, v3.8b, v7.8b
        uabal2          v17.8h, v3.16b, v7.16b
        cmp             w4, #4                      // if h >= 4, loop
        add             v16.8h, v16.8h, v17.8h
        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
        add             d18, d16, d18               // add to the end result register

        b.ge            1b
        cbnz            w4, 2f                      // if iterations remain, jump to completion section

        fmov            w0, s18                     // copy result to general purpose register
        ret

2:
        // Completion loop: one row per iteration, handling h % 4 leftover
        // rows (or every row when h < 4 on entry).
        ld1             {v0.16b}, [x1], x3          // load pix1
        ld1             {v4.16b}, [x2], x3          // load pix2
        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
        uabal2          v16.8h, v0.16b, v4.16b
        subs            w4, w4, #1                  // h -= 1
        addv            h16, v16.8h                 // add up v16
        add             d18, d16, d18               // add to result
        b.ne            2b

        fmov            w0, s18                     // copy result to general purpose register
        ret
endfunc
function ff_pix_abs16_xy2_neon, export=1
        // SAD against the rounded half-pel (x+0.5, y+0.5) average of pix2:
        //   sum |pix1[i] - (pix2[i] + pix2[i+1]
        //                   + pix3[i] + pix3[i+1] + 2) >> 2|, i = 0..15,
        // per row, where pix3 = pix2 + stride.
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        //
        // Result returned in w0.
        // NOTE: AAPCS64 reserves the low 64 bits of v8-v15 as callee-saved.
        // This function deliberately uses only volatile registers
        // (v0-v7, v16-v31); previously v8 was clobbered here without being
        // saved, which corrupted caller state.

        add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
        movi            v0.2d, #0                   // initialize the result register

        // Load initial pix2 values for either the unrolled version or completion version.
        ldur            q4, [x2, #1]                // load pix2+1 (unaligned, hence ldur)
        ldr             q3, [x2]                    // load pix2
        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
        cmp             w4, #4                      // if h < 4 jump to the completion version
        b.lt            2f
1:
        // This is an unrolled implementation. It completes 4 iterations of the C for each branch.
        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
        // plus two at the beginning to start.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1

        ldur            q7, [x5, #1]                // load pix3+1
        ld1             {v6.16b}, [x5], x3          // load pix3
        ld1             {v16.16b}, [x1], x3         // load pix1

        ldur            q19, [x5, #1]               // load pix3+1
        ld1             {v18.16b}, [x5], x3         // load pix3
        ld1             {v17.16b}, [x1], x3         // load pix1

        ldur            q22, [x5, #1]               // load pix3+1
        ld1             {v21.16b}, [x5], x3         // load pix3
        ld1             {v20.16b}, [x1], x3         // load pix1

        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        rshrn           v23.8b, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v23.16b, v24.8h, #2         // shift right 2 8..15

        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v26.16b, v27.8h, #2         // shift right 2 8..15

        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15

        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15

        // Averages are now stored in these registers:
        // v23, v26, v28, v30
        // pix1 values in these registers:
        // v1, v16, v17, v20
        // available:
        // v4, v5, v7, v18, v19, v24, v25, v27, v29, v31

        sub             w4, w4, #4                  // h -= 4

        // Using absolute-difference instructions instead of absolute-difference-accumulate allows
        // us to keep the results in 16b vectors instead of widening values with twice the instructions.
        // This approach also has fewer data dependencies, allowing better instruction level parallelism.
        uabd            v4.16b, v1.16b, v23.16b     // absolute difference 0..15, i=0
        uabd            v5.16b, v16.16b, v26.16b    // absolute difference 0..15, i=1
        uabd            v6.16b, v17.16b, v28.16b    // absolute difference 0..15, i=2
        uabd            v7.16b, v20.16b, v30.16b    // absolute difference 0..15, i=3

        cmp             w4, #4                      // loop if h >= 4

        // Now add up all the values in each vector, v4-v7 with widening adds
        uaddl           v19.8h, v4.8b, v5.8b
        uaddl2          v18.8h, v4.16b, v5.16b
        uaddl           v4.8h, v6.8b, v7.8b
        uaddl2          v5.8h, v6.16b, v7.16b
        add             v4.8h, v4.8h, v5.8h
        add             v4.8h, v4.8h, v18.8h
        add             v4.8h, v4.8h, v19.8h
        uaddlv          s4, v4.8h                   // finish adding up accumulated values
        add             d0, d0, d4                  // add the value to the top level accumulator

        b.ge            1b
        cbnz            w4, 2f                      // if iterations remain jump to completion section

        fmov            w0, s0                      // copy result to general purpose register
        ret
2:
        // Completion loop: one row per iteration, for h % 4 leftover rows
        // (or all rows when h < 4).  v2 and v3 are set either at the end of
        // this loop or by the setup/unrolled code above, which branches here
        // when h % 4 != 0.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1
        subs            w4, w4, #1                  // decrement h

        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        // divide by 4 to compute the average of values summed above
        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15

        // v7 is volatile and dead here on both entry paths; the original code
        // used v8, whose low 64 bits are callee-saved and must not be clobbered.
        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7

        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
        mov             v2.16b, v18.16b             // pix3 -> pix2
        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
        uaddlv          s6, v6.8h                   // add up accumulator in v6
        add             d0, d0, d6                  // add to the final result

        b.ne            2b                          // loop if h > 0
        fmov            w0, s0                      // copy result to general purpose register
        ret
endfunc