/*
 * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// int ff_pix_abs16_neon(MpegEncContext *v, const uint8_t *pix1,
//                       const uint8_t *pix2, ptrdiff_t stride, int h)
// Sum of absolute differences over h rows of 16 bytes.
// Uses only caller-saved SIMD registers (v0-v7, v16-v18); no stack frame needed.
function ff_pix_abs16_neon, export=1
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        cmp             w4, #4                      // if h < 4, jump to completion section
        movi            v18.4s, #0                  // clear result accumulator
        b.lt            2f
1:
        // Unrolled x4: loads are interleaved with the uabdl/uabal chain to hide latency.
        ld1             {v0.16b}, [x1], x3          // load pix1
        ld1             {v4.16b}, [x2], x3          // load pix2
        ld1             {v1.16b}, [x1], x3          // load pix1
        ld1             {v5.16b}, [x2], x3          // load pix2
        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
        uabdl2          v17.8h, v0.16b, v4.16b
        ld1             {v2.16b}, [x1], x3          // load pix1
        ld1             {v6.16b}, [x2], x3          // load pix2
        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
        uabal2          v17.8h, v1.16b, v5.16b
        ld1             {v3.16b}, [x1], x3
        ld1             {v7.16b}, [x2], x3
        uabal           v16.8h, v2.8b, v6.8b
        uabal2          v17.8h, v2.16b, v6.16b
        sub             w4, w4, #4                  // h -= 4
        uabal           v16.8h, v3.8b, v7.8b
        uabal2          v17.8h, v3.16b, v7.16b
        cmp             w4, #4                      // if h >= 4, loop
        add             v16.8h, v16.8h, v17.8h
        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
        add             d18, d16, d18               // add to the end result register

        b.ge            1b
        cbnz            w4, 2f                      // if iterations remain, jump to completion section

        fmov            w0, s18                     // copy result to general purpose register
        ret

2:
        // Completion section: one row per iteration for the h % 4 leftover rows
        // (also the whole loop when h < 4).
        ld1             {v0.16b}, [x1], x3          // load pix1
        ld1             {v4.16b}, [x2], x3          // load pix2
        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
        uabal2          v16.8h, v0.16b, v4.16b
        subs            w4, w4, #1                  // h -= 1
        addv            h16, v16.8h                 // add up v16
        add             d18, d16, d18               // add to result
        b.ne            2b

        fmov            w0, s18                     // copy result to general purpose register
        ret
endfunc

// int ff_pix_abs16_xy2_neon(MpegEncContext *v, const uint8_t *pix1,
//                           const uint8_t *pix2, ptrdiff_t stride, int h)
// SAD of pix1 against the half-pel (x+1/2, y+1/2) interpolation of pix2:
// avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1]) with rounding, where pix3 = pix2 + stride.
// Uses only caller-saved SIMD registers (v0-v7, v16-v31); AAPCS64 requires the
// low 64 bits of v8-v15 to be preserved, so those are deliberately avoided.
function ff_pix_abs16_xy2_neon, export=1
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h

        add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
        movi            v0.2d, #0                   // initialize the result register

        // Load initial pix2 values for either the unrolled version or completion version.
        ldur            q4, [x2, #1]                // load pix2+1
        ldr             q3, [x2]                    // load pix2
        uaddl           v2.8h, v4.8b, v3.8b         // pix2 + pix2+1 0..7
        uaddl2          v3.8h, v4.16b, v3.16b       // pix2 + pix2+1 8..15
        cmp             w4, #4                      // if h < 4 jump to the completion version
        b.lt            2f
1:
        // This is an unrolled implementation. It completes 4 iterations of the C for each branch.
        // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
        // plus two at the beginning to start.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1

        ldur            q7, [x5, #1]                // load pix3+1
        ld1             {v6.16b}, [x5], x3          // load pix3
        ld1             {v16.16b}, [x1], x3         // load pix1

        ldur            q19, [x5, #1]               // load pix3+1
        ld1             {v18.16b}, [x5], x3         // load pix3
        ld1             {v17.16b}, [x1], x3         // load pix1

        ldur            q22, [x5, #1]               // load pix3+1
        ld1             {v21.16b}, [x5], x3         // load pix3
        ld1             {v20.16b}, [x1], x3         // load pix1

        // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
        uaddl           v30.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v31.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v23.8h, v2.8h, v30.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v24.8h, v3.8h, v31.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        rshrn           v23.8b, v23.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v23.16b, v24.8h, #2         // shift right 2 8..15

        uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
        add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v26.16b, v27.8h, #2         // shift right 2 8..15

        uaddl           v4.8h, v18.8b, v19.8b       // pix3 + pix3+1 0..7
        uaddl2          v5.8h, v18.16b, v19.16b     // pix3 + pix3+1 8..15
        add             v28.8h, v2.8h, v4.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v29.8h, v3.8h, v5.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15

        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
        add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
        add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
        rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
        rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15

        // Averages are now stored in these registers:
        //   v23, v26, v28, v30
        // pix1 values in these registers:
        //   v1, v16, v17, v20
        // available:
        //   v4, v5, v7, v18, v19, v24, v25, v27, v29, v31

        sub             w4, w4, #4                  // h -= 4

        // Using absolute-difference instructions instead of absolute-difference-accumulate allows
        // us to keep the results in 16b vectors instead of widening values with twice the instructions.
        // This approach also has fewer data dependencies, allowing better instruction level parallelism.
        uabd            v4.16b, v1.16b, v23.16b     // absolute difference 0..15, i=0
        uabd            v5.16b, v16.16b, v26.16b    // absolute difference 0..15, i=1
        uabd            v6.16b, v17.16b, v28.16b    // absolute difference 0..15, i=2
        uabd            v7.16b, v20.16b, v30.16b    // absolute difference 0..15, i=3

        cmp             w4, #4                      // loop if h >= 4

        // Now add up all the values in each vector, v4-v7 with widening adds
        uaddl           v19.8h, v4.8b, v5.8b
        uaddl2          v18.8h, v4.16b, v5.16b
        uaddl           v4.8h, v6.8b, v7.8b
        uaddl2          v5.8h, v6.16b, v7.16b
        add             v4.8h, v4.8h, v5.8h
        add             v4.8h, v4.8h, v18.8h
        add             v4.8h, v4.8h, v19.8h
        uaddlv          s4, v4.8h                   // finish adding up accumulated values
        add             d0, d0, d4                  // add the value to the top level accumulator

        b.ge            1b
        cbnz            w4, 2f                      // if iterations remain jump to completion section

        fmov            w0, s0                      // copy result to general purpose register
        ret
2:
        // v2 and v3 are set either at the end of this loop or at from the unrolled version
        // which branches here to complete iterations when h % 4 != 0.
        ldur            q5, [x5, #1]                // load pix3+1
        ld1             {v4.16b}, [x5], x3          // load pix3
        ld1             {v1.16b}, [x1], x3          // load pix1
        subs            w4, w4, #1                  // decrement h

        uaddl           v18.8h, v4.8b, v5.8b        // pix3 + pix3+1 0..7
        uaddl2          v19.8h, v4.16b, v5.16b      // pix3 + pix3+1 8..15
        add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
        add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
        // divide by 4 to compute the average of values summed above
        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15

        // v7 is free here; do NOT use v8-v15, whose low 64 bits are callee-saved
        // under AAPCS64 and would be silently clobbered for the caller.
        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7

        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
        mov             v2.16b, v18.16b             // pix3 -> pix2
        mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
        uaddlv          s6, v6.8h                   // add up accumulator in v6
        add             d0, d0, d6                  // add to the final result

        b.ne            2b                          // loop if h > 0
        fmov            w0, s0                      // copy result to general purpose register
        ret
endfunc