/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;
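
/* ENTRY(f) emits the .text/.align/.globl/.type boilerplate and the label f:,
 * PRIVATE(f) does the same without making the symbol global, and END(f)
 * records the symbol's size for the ELF symbol table.  Functions below are
 * bracketed as ENTRY(name) or PRIVATE(name) ... END(name).
 */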

//#define ARCH_ARM64_USE_BLUR_PRELOAD

/* Number of fractional bits to preserve in intermediate results.  The
 * intermediate storage is 16-bit, and we started with 8-bit data (the integer
 * part), so this should be between 0 and 8.
 */
.set FRACTION_BITS, 7
.set MAX_R, 25
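
/* A rough scalar model of the fixed-point pipeline, for orientation only: it
 * ignores the saturation that uqrshrn provides, and assumes the coefficient
 * tables are unsigned 0.16 fixed-point values summing to 65536, as the #16
 * shifts in the narrowing steps below imply.
 *
 *   // vertical pass: sums of 8.0 pixels * 0.16 coefficients, kept with
 *   // FRACTION_BITS of fractional precision in 16-bit storage
 *   uint16_t vert_narrow(uint32_t acc) {
 *       int sh = 16 - FRACTION_BITS;
 *       return (uint16_t)((acc + (1u << (sh - 1))) >> sh);   // like uqrshrn
 *   }
 *
 *   // horizontal pass: sums of 8.FRACTION_BITS * 0.16 back to an 8.0 byte
 *   uint8_t horz_narrow(uint32_t acc) {
 *       uint32_t t = (acc + (1u << 15)) >> 16;               // uqrshrn #16
 *       return (uint8_t)((t + (1u << (FRACTION_BITS - 1))) >> FRACTION_BITS);
 *   }
 */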


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm
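
/* For example:
 *
 *     .set    cc, 1
 *     ifcc    add     x0, x0, #1      // assembled
 *     .set    cc, 0
 *     ifcc    add     x0, x0, #1      // dropped
 */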

/* It's not always clear that prefetching is beneficial and this needs further
 * testing on different cores, so it's made switchable here.
 */
#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
#else
#define VERTPLD(...) nop
#endif

/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top or
 * bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch directly
 * into the relevant part of the code for an arbitrary convolution radius.  Two
 * variants of the loop are produced; one eliminates the clamping code for a
 * slight speed advantage.
 *
 * Where the macro is called with a reg argument other than the default x12,
 * the specified register is taken to already contain a pre-calculated pointer
 * into one of the two loops.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup (r, unless clipped to top of source image)
 *      x7 -- rdn (r, unless clipped to bottom of source image)
 *      x12 -- switch index
 *      v0-v3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += 16
 *      v10,v11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      v12-v15 = temporary sums
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            uxtl        v14.8h, v15.8b
            VERTPLD(x1, #16)
            uxtl2       v15.8h, v15.16b
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* This version of the vertical fetch loop body is used away from the edges
   * of the source image.  The pointers start at the top and bottom source rows
   * and work their way towards the centre on each iteration.  This way the
   * number of taps used can be controlled by jumping directly into the middle
   * of the loop and running to completion.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly.
   */
  .macro vertfetch_noclamp i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            ld1         {v11.16b}, [x11], x13
            uaddl       v16.8h, v10.8b, v11.8b
            uaddl2      v11.8h, v10.16b, v11.16b
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* This version of the vertical fetch loop body is used near the edges of the
   * source image, where one or both of the accesses may start with a clamped
   * value, and the row addresses only begin to change a certain number of
   * iterations before the end.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly.
   */
  .macro vertfetch_clamped i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            cmp         x6, #\i
            ld1         {v11.16b}, [x11], x13
            csel        x10, x15, x10, lo
            uaddl       v16.8h, v10.8b, v11.8b
            cmp         x7, #\i
            uaddl2      v11.8h, v10.16b, v11.16b
            csel        x11, x19, x11, lo
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelc at the end of the block.
   */
  .align 4
  vertfetch_clamped 27, v3.h[3]
  vertfetch_clamped 26, v3.h[2]
  vertfetch_clamped 25, v3.h[1]
  vertfetch_clamped 24, v3.h[0]
  vertfetch_clamped 23, v2.h[7]
  vertfetch_clamped 22, v2.h[6]
  vertfetch_clamped 21, v2.h[5]
  vertfetch_clamped 20, v2.h[4]
  vertfetch_clamped 19, v2.h[3]
  vertfetch_clamped 18, v2.h[2]
  vertfetch_clamped 17, v2.h[1]
  vertfetch_clamped 16, v2.h[0]
  vertfetch_clamped 15, v1.h[7]
  vertfetch_clamped 14, v1.h[6]
  vertfetch_clamped 13, v1.h[5]
  vertfetch_clamped 12, v1.h[4]
  vertfetch_clamped 11, v1.h[3]
  vertfetch_clamped 10, v1.h[2]
  vertfetch_clamped  9, v1.h[1]
  vertfetch_clamped  8, v1.h[0]
  vertfetch_clamped  7, v0.h[7]
  vertfetch_clamped  6, v0.h[6]
  vertfetch_clamped  5, v0.h[5]
  vertfetch_clamped  4, v0.h[4]
  vertfetch_clamped  3, v0.h[3]
  vertfetch_clamped  2, v0.h[2]
  vertfetch_clamped  1, v0.h[1]
  vertfetch_clamped  0, v0.h[0]
  1:
  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelnc at the end of the block.
   */
  .align 4
  vertfetch_noclamp 27, v3.h[3]
  vertfetch_noclamp 26, v3.h[2]
  vertfetch_noclamp 25, v3.h[1]
  vertfetch_noclamp 24, v3.h[0]
  vertfetch_noclamp 23, v2.h[7]
  vertfetch_noclamp 22, v2.h[6]
  vertfetch_noclamp 21, v2.h[5]
  vertfetch_noclamp 20, v2.h[4]
  vertfetch_noclamp 19, v2.h[3]
  vertfetch_noclamp 18, v2.h[2]
  vertfetch_noclamp 17, v2.h[1]
  vertfetch_noclamp 16, v2.h[0]
  vertfetch_noclamp 15, v1.h[7]
  vertfetch_noclamp 14, v1.h[6]
  vertfetch_noclamp 13, v1.h[5]
  vertfetch_noclamp 12, v1.h[4]
  vertfetch_noclamp 11, v1.h[3]
  vertfetch_noclamp 10, v1.h[2]
  vertfetch_noclamp  9, v1.h[1]
  vertfetch_noclamp  8, v1.h[0]
  vertfetch_noclamp  7, v0.h[7]
  vertfetch_noclamp  6, v0.h[6]
  vertfetch_noclamp  5, v0.h[5]
  vertfetch_noclamp  4, v0.h[4]
  vertfetch_noclamp  3, v0.h[3]
  vertfetch_noclamp  2, v0.h[2]
  vertfetch_noclamp  1, v0.h[1]
  vertfetch_noclamp  0, v0.h[0]
  \labelnc :

  .purgem vertfetch_clamped
  .purgem vertfetch_noclamp

  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/
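
/* As a rough scalar model, one invocation of fetch computes, per column (a
 * sketch only: the real macro processes 16 columns at once, and instead of
 * testing a loop counter it branches into the unrolled loop at the right tap;
 * saturation is omitted):
 *
 *   uint16_t fetch_column(const uint8_t *src, ptrdiff_t pitch,
 *                         int r, int rup, int rdn, const uint16_t *coef) {
 *       uint32_t acc = (uint32_t)src[0] * coef[0];
 *       for (int i = 1; i <= r; i++) {
 *           // row offsets clamp at the image edges, as the csels above do
 *           uint8_t up = src[-(ptrdiff_t)(i <= rup ? i : rup) * pitch];
 *           uint8_t dn = src[ (ptrdiff_t)(i <= rdn ? i : rdn) * pitch];
 *           acc += ((uint32_t)up + dn) * coef[i];
 *       }
 *       int sh = 16 - FRACTION_BITS;
 *       return (uint16_t)((acc + (1u << (sh - 1))) >> sh);
 *   }
 */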

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * fetching more data that won't be used, and that rotating the window involves
 * more mov operations.
 *
 * When the window gets too big for the register file, the spill buffer at [x9]
 * is used.
 *
 * Input:
 *      v16-v31,v4-v11 -- convolution window
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 * Modifies:
 *      x12 -- temp buffer pointer
 *      v12-v13 -- temporaries for load and vext operations.
 *      v14-v15 -- intermediate sums
 */
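
/* Roughly, each hconv step evaluates the symmetric filter over the 16-bit
 * vertically-convolved intermediates s[], centred on the output position (a
 * scalar sketch; the real macros hold s[] in the register window, realign it
 * with ext, and slide it along with the mov block at the end of each macro):
 *
 *   uint32_t acc = (uint32_t)s[0] * coef[0];
 *   for (int i = 1; i <= r; i++)
 *       acc += ((uint32_t)s[-i] + s[i]) * coef[i];
 *   // ...then the two uqrshrn narrowing steps shown at the end of each macro
 */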
#define TUNED_LIST1 8, 16
.macro hconv1_8/*{{{*/
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .align      4
    108:    umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
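
/* The adr/ldrsh/add/br preamble in each hconv macro is a hand-built jump
 * table: each .hword entry holds the distance from 100: to the tap block for
 * one radius, so the br lands on the outermost wanted tap and execution falls
 * through towards the centre.  A C sketch of the same dispatch, as a switch
 * with deliberate fallthrough (radius capped at 3 to keep it short):
 *
 *   switch (r) {
 *   case 3: acc += ((uint32_t)s[-3] + s[3]) * coef[3];  // fall through
 *   case 2: acc += ((uint32_t)s[-2] + s[2]) * coef[2];  // fall through
 *   case 1: acc += ((uint32_t)s[-1] + s[1]) * coef[1];
 *   }
 */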

.macro hconv1_16/*{{{*/
            umull       v14.4s, v8.4h, v0.h[0]
            umull2      v15.4s, v8.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .align 4
    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
            //ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v6.4h, v2.h[0]
            umlal2      v15.4s, v6.8h, v2.h[0]
            umlal       v14.4s, v10.4h, v2.h[0]
            umlal2      v15.4s, v10.8h, v2.h[0]
    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v6.8h, v1.h[4]
            umlal       v15.4s, v7.4h, v1.h[4]
            umlal2      v14.4s, v9.8h, v1.h[4]
            umlal       v15.4s, v10.4h, v1.h[4]
    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
            //ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v7.4h, v1.h[0]
            umlal2      v15.4s, v7.8h, v1.h[0]
            umlal       v14.4s, v9.4h, v1.h[0]
            umlal2      v15.4s, v9.8h, v1.h[0]
    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
            //ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal2      v14.4s, v7.8h, v0.h[4]
            umlal       v15.4s, v8.4h, v0.h[4]
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

.macro hconv1_25/*{{{*/
            ext         v12.16b, v6.16b, v7.16b, #7*2
            umull       v14.4s, v12.4h, v0.h[0]
            umull2      v15.4s, v12.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .hword 121f-100b
            .hword 122f-100b
            .hword 123f-100b
            .hword 124f-100b
            .hword 125f-100b
            .align 4
    125:    ext         v12.16b, v31.16b, v4.16b, #6*2
            ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v13.4h, v3.h[1]
            umlal2      v15.4s, v13.8h, v3.h[1]
    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal2      v15.4s, v12.8h, v3.h[0]
            umlal       v14.4s, v13.4h, v3.h[0]
            umlal2      v15.4s, v13.8h, v3.h[0]
    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v13.4h, v2.h[7]
            umlal2      v15.4s, v13.8h, v2.h[7]
    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal2      v15.4s, v12.8h, v2.h[6]
            umlal       v14.4s, v13.4h, v2.h[6]
            umlal2      v15.4s, v13.8h, v2.h[6]
    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v13.4h, v2.h[5]
            umlal2      v15.4s, v13.8h, v2.h[5]
    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal2      v15.4s, v12.8h, v2.h[4]
            umlal       v14.4s, v13.4h, v2.h[4]
            umlal2      v15.4s, v13.8h, v2.h[4]
    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v13.4h, v2.h[3]
            umlal2      v15.4s, v13.8h, v2.h[3]
    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal2      v15.4s, v12.8h, v2.h[2]
            umlal       v14.4s, v13.4h, v2.h[2]
            umlal2      v15.4s, v13.8h, v2.h[2]
    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v12.4h, v2.h[1]
            umlal2      v15.4s, v12.8h, v2.h[1]
            umlal       v14.4s, v13.4h, v2.h[1]
            umlal2      v15.4s, v13.8h, v2.h[1]
    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v2.h[0]
            umlal2      v15.4s, v12.8h, v2.h[0]
            umlal       v14.4s, v13.4h, v2.h[0]
            umlal2      v15.4s, v13.8h, v2.h[0]
    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[4]
            umlal2      v15.4s, v12.8h, v1.h[4]
            umlal       v14.4s, v13.4h, v1.h[4]
            umlal2      v15.4s, v13.8h, v1.h[4]
    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #0*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
            ext         v13.16b, v7.16b, v8.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[0]
            umlal2      v15.4s, v12.8h, v1.h[0]
            umlal       v14.4s, v13.4h, v1.h[0]
            umlal2      v15.4s, v13.8h, v1.h[0]
    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
            ext         v13.16b, v7.16b, v8.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v7.16b, v8.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v7.16b, v8.16b, #4*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v7.16b, v8.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[4]
            umlal2      v15.4s, v12.8h, v0.h[4]
            umlal       v14.4s, v13.4h, v0.h[4]
            umlal2      v15.4s, v13.8h, v0.h[4]
    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
            ext         v13.16b, v7.16b, v8.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v7.16b, v8.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v7.16b, v8.16b, #0*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

#define TUNED_LIST4 6, 12, 20
.macro hconv4_6/*{{{*/
            umull       v14.4s, v7.4h, v0.h[0]
            umull2      v15.4s, v7.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .align      4
    106:    umlal       v14.4s, v4.4h,  v0.h[6]
            umlal2      v15.4s, v4.8h,  v0.h[6]
            umlal       v14.4s, v10.4h, v0.h[6]
            umlal2      v15.4s, v10.8h, v0.h[6]
    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
            umlal       v15.4s, v5.4h, v0.h[5]
            umlal2      v14.4s, v9.8h, v0.h[5]
            umlal       v15.4s, v10.4h, v0.h[5]
    104:    umlal       v14.4s, v5.4h, v0.h[4]
            umlal2      v15.4s, v5.8h, v0.h[4]
            umlal       v14.4s, v9.4h, v0.h[4]
            umlal2      v15.4s, v9.8h, v0.h[4]
    103:    umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
            umlal2      v14.4s, v8.8h, v0.h[3]
            umlal       v15.4s, v9.4h, v0.h[3]
    102:    umlal       v14.4s, v6.4h, v0.h[2]
            umlal2      v15.4s, v6.8h, v0.h[2]
            umlal       v14.4s, v8.4h, v0.h[2]
            umlal2      v15.4s, v8.8h, v0.h[2]
    101:    umlal2      v14.4s, v6.8h, v0.h[1]
            umlal       v15.4s, v7.4h, v0.h[1]
            umlal2      v14.4s, v7.8h, v0.h[1]
            umlal       v15.4s, v8.4h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

.macro hconv4_12/*{{{*/
            umull       v14.4s, v4.4h, v0.h[0]
            umull2      v15.4s, v4.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .align 4
    112:    umlal       v14.4s, v26.4h, v1.h[4]
            umlal2      v15.4s, v26.8h, v1.h[4]
            umlal       v14.4s, v10.4h, v1.h[4]
            umlal2      v15.4s, v10.8h, v1.h[4]
    111:    umlal2      v14.4s, v26.8h, v1.h[3]
            umlal       v15.4s, v27.4h, v1.h[3]
            umlal2      v14.4s, v9.8h, v1.h[3]
            umlal       v15.4s, v10.4h, v1.h[3]
    110:    umlal       v14.4s, v27.4h, v1.h[2]
            umlal2      v15.4s, v27.8h, v1.h[2]
            umlal       v14.4s, v9.4h, v1.h[2]
            umlal2      v15.4s, v9.8h, v1.h[2]
    109:    umlal2      v14.4s, v27.8h, v1.h[1]
            umlal       v15.4s, v28.4h, v1.h[1]
            umlal2      v14.4s, v8.8h, v1.h[1]
            umlal       v15.4s, v9.4h, v1.h[1]
    108:    umlal       v14.4s, v28.4h, v1.h[0]
            umlal2      v15.4s, v28.8h, v1.h[0]
            umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
    107:    umlal2      v14.4s, v28.8h, v0.h[7]
            umlal       v15.4s, v29.4h, v0.h[7]
            umlal2      v14.4s, v7.8h, v0.h[7]
            umlal       v15.4s, v8.4h, v0.h[7]
    106:    umlal       v14.4s, v29.4h, v0.h[6]
            umlal2      v15.4s, v29.8h, v0.h[6]
            umlal       v14.4s, v7.4h, v0.h[6]
            umlal2      v15.4s, v7.8h, v0.h[6]
    105:    umlal2      v14.4s, v29.8h, v0.h[5]
            umlal       v15.4s, v30.4h, v0.h[5]
            umlal2      v14.4s, v6.8h, v0.h[5]
            umlal       v15.4s, v7.4h, v0.h[5]
    104:    umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
            umlal       v14.4s, v6.4h, v0.h[4]
            umlal2      v15.4s, v6.8h, v0.h[4]
    103:    umlal2      v14.4s, v30.8h, v0.h[3]
            umlal       v15.4s, v31.4h, v0.h[3]
            umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
    102:    umlal       v14.4s, v31.4h, v0.h[2]
            umlal2      v15.4s, v31.8h, v0.h[2]
            umlal       v14.4s, v5.4h, v0.h[2]
            umlal2      v15.4s, v5.8h, v0.h[2]
    101:    umlal2      v14.4s, v31.8h, v0.h[1]
            umlal       v15.4s, v4.4h,  v0.h[1]
            umlal2      v14.4s, v4.8h,  v0.h[1]
            umlal       v15.4s, v5.4h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

.macro hconv4_20/*{{{*/
            umull       v14.4s, v28.4h, v0.h[0]
            umull2      v15.4s, v28.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .align 4

    120:    umlal       v14.4s, v18.4h, v2.h[4]
            umlal2      v15.4s, v18.8h, v2.h[4]
            umlal       v14.4s, v10.4h, v2.h[4]
            umlal2      v15.4s, v10.8h, v2.h[4]
    119:    umlal2      v14.4s, v18.8h, v2.h[3]
            umlal       v15.4s, v19.4h, v2.h[3]
            umlal2      v14.4s, v9.8h,  v2.h[3]
            umlal       v15.4s, v10.4h, v2.h[3]
    118:    umlal       v14.4s, v19.4h, v2.h[2]
            umlal2      v15.4s, v19.8h, v2.h[2]
            umlal       v14.4s, v9.4h,  v2.h[2]
            umlal2      v15.4s, v9.8h,  v2.h[2]
    117:    umlal2      v14.4s, v19.8h, v2.h[1]
            umlal       v15.4s, v20.4h, v2.h[1]
            umlal2      v14.4s, v8.8h,  v2.h[1]
            umlal       v15.4s, v9.4h,  v2.h[1]
    116:    umlal       v14.4s, v20.4h, v2.h[0]
            umlal2      v15.4s, v20.8h, v2.h[0]
            umlal       v14.4s, v8.4h,  v2.h[0]
            umlal2      v15.4s, v8.8h,  v2.h[0]
    115:    umlal2      v14.4s, v20.8h, v1.h[7]
            umlal       v15.4s, v21.4h, v1.h[7]
            umlal2      v14.4s, v7.8h,  v1.h[7]
            umlal       v15.4s, v8.4h,  v1.h[7]
    114:    umlal       v14.4s, v21.4h, v1.h[6]
            umlal2      v15.4s, v21.8h, v1.h[6]
            umlal       v14.4s, v7.4h,  v1.h[6]
            umlal2      v15.4s, v7.8h,  v1.h[6]
    113:    umlal2      v14.4s, v21.8h, v1.h[5]
            umlal       v15.4s, v22.4h, v1.h[5]
            umlal2      v14.4s, v6.8h,  v1.h[5]
            umlal       v15.4s, v7.4h,  v1.h[5]
    112:    umlal       v14.4s, v22.4h, v1.h[4]
            umlal2      v15.4s, v22.8h, v1.h[4]
            umlal       v14.4s, v6.4h,  v1.h[4]
            umlal2      v15.4s, v6.8h,  v1.h[4]
    111:    umlal2      v14.4s, v22.8h, v1.h[3]
            umlal       v15.4s, v23.4h, v1.h[3]
            umlal2      v14.4s, v5.8h,  v1.h[3]
            umlal       v15.4s, v6.4h,  v1.h[3]
    110:    umlal       v14.4s, v23.4h, v1.h[2]
            umlal2      v15.4s, v23.8h, v1.h[2]
            umlal       v14.4s, v5.4h,  v1.h[2]
            umlal2      v15.4s, v5.8h,  v1.h[2]
    109:    umlal2      v14.4s, v23.8h, v1.h[1]
            umlal       v15.4s, v24.4h, v1.h[1]
            umlal2      v14.4s, v4.8h,  v1.h[1]
            umlal       v15.4s, v5.4h,  v1.h[1]
    108:    umlal       v14.4s, v24.4h, v1.h[0]
            umlal2      v15.4s, v24.8h, v1.h[0]
            umlal       v14.4s, v4.4h,  v1.h[0]
            umlal2      v15.4s, v4.8h,  v1.h[0]
    107:    umlal2      v14.4s, v24.8h, v0.h[7]
            umlal       v15.4s, v25.4h, v0.h[7]
            umlal2      v14.4s, v31.8h, v0.h[7]
            umlal       v15.4s, v4.4h,  v0.h[7]
    106:    umlal       v14.4s, v25.4h, v0.h[6]
            umlal2      v15.4s, v25.8h, v0.h[6]
            umlal       v14.4s, v31.4h, v0.h[6]
            umlal2      v15.4s, v31.8h, v0.h[6]
    105:    umlal2      v14.4s, v25.8h, v0.h[5]
            umlal       v15.4s, v26.4h, v0.h[5]
            umlal2      v14.4s, v30.8h, v0.h[5]
            umlal       v15.4s, v31.4h, v0.h[5]
    104:    umlal       v14.4s, v26.4h, v0.h[4]
            umlal2      v15.4s, v26.8h, v0.h[4]
            umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
    103:    umlal2      v14.4s, v26.8h, v0.h[3]
            umlal       v15.4s, v27.4h, v0.h[3]
            umlal2      v14.4s, v29.8h, v0.h[3]
            umlal       v15.4s, v30.4h, v0.h[3]
    102:    umlal       v14.4s, v27.4h, v0.h[2]
            umlal2      v15.4s, v27.8h, v0.h[2]
            umlal       v14.4s, v29.4h, v0.h[2]
            umlal2      v15.4s, v29.8h, v0.h[2]
    101:    umlal2      v14.4s, v27.8h, v0.h[1]
            umlal       v15.4s, v28.4h, v0.h[1]
            umlal2      v14.4s, v28.8h, v0.h[1]
            umlal       v15.4s, v29.4h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

.macro hconv4_25/*{{{*/
            umull2      v14.4s, v25.8h, v0.h[0]
            umull       v15.4s, v26.4h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .hword 121f-100b
            .hword 122f-100b
            .hword 123f-100b
            .hword 124f-100b
            .hword 125f-100b
            .align 4

    125:    ld1         {v12.8h}, [x9]
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v10.4h, v3.h[1]
            umlal2      v15.4s, v10.8h, v3.h[1]
    124:    add         x12, x9, #0x08
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal       v15.4s, v13.4h, v3.h[0]
            umlal2      v14.4s, v9.8h,  v3.h[0]
            umlal       v15.4s, v10.4h, v3.h[0]
    123:    add         x12, x9, #0x10
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v9.4h,  v2.h[7]
            umlal2      v15.4s, v9.8h,  v2.h[7]
    122:    add         x12, x9, #0x18
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal       v15.4s, v13.4h, v2.h[6]
            umlal2      v14.4s, v8.8h,  v2.h[6]
            umlal       v15.4s, v9.4h,  v2.h[6]
    121:    add         x12, x9, #0x20
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v8.4h,  v2.h[5]
            umlal2      v15.4s, v8.8h,  v2.h[5]
    120:    add         x12, x9, #0x28
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal       v15.4s, v13.4h, v2.h[4]
            umlal2      v14.4s, v7.8h,  v2.h[4]
            umlal       v15.4s, v8.4h,  v2.h[4]
    119:    add         x12, x9, #0x30
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v7.4h,  v2.h[3]
            umlal2      v15.4s, v7.8h,  v2.h[3]
    118:    add         x12, x9, #0x38
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal       v15.4s, v17.4h, v2.h[2]
            umlal2      v14.4s, v6.8h,  v2.h[2]
            umlal       v15.4s, v7.4h,  v2.h[2]
    117:    umlal       v14.4s, v17.4h, v2.h[1]
            umlal2      v15.4s, v17.8h, v2.h[1]
            umlal       v14.4s, v6.4h,  v2.h[1]
            umlal2      v15.4s, v6.8h,  v2.h[1]
    116:    umlal2      v14.4s, v17.8h, v2.h[0]
            umlal       v15.4s, v18.4h, v2.h[0]
            umlal2      v14.4s, v5.8h,  v2.h[0]
            umlal       v15.4s, v6.4h,  v2.h[0]
    115:    umlal       v14.4s, v18.4h, v1.h[7]
            umlal2      v15.4s, v18.8h, v1.h[7]
            umlal       v14.4s, v5.4h,  v1.h[7]
            umlal2      v15.4s, v5.8h,  v1.h[7]
    114:    umlal2      v14.4s, v18.8h, v1.h[6]
            umlal       v15.4s, v19.4h, v1.h[6]
            umlal2      v14.4s, v4.8h,  v1.h[6]
            umlal       v15.4s, v5.4h,  v1.h[6]
    113:    umlal       v14.4s, v19.4h, v1.h[5]
            umlal2      v15.4s, v19.8h, v1.h[5]
            umlal       v14.4s, v4.4h,  v1.h[5]
            umlal2      v15.4s, v4.8h,  v1.h[5]
    112:    umlal2      v14.4s, v19.8h, v1.h[4]
            umlal       v15.4s, v20.4h, v1.h[4]
            umlal2      v14.4s, v31.8h, v1.h[4]
            umlal       v15.4s, v4.4h,  v1.h[4]
    111:    umlal       v14.4s, v20.4h, v1.h[3]
            umlal2      v15.4s, v20.8h, v1.h[3]
            umlal       v14.4s, v31.4h, v1.h[3]
            umlal2      v15.4s, v31.8h, v1.h[3]
    110:    umlal2      v14.4s, v20.8h, v1.h[2]
            umlal       v15.4s, v21.4h, v1.h[2]
            umlal2      v14.4s, v30.8h, v1.h[2]
            umlal       v15.4s, v31.4h, v1.h[2]
    109:    umlal       v14.4s, v21.4h, v1.h[1]
            umlal2      v15.4s, v21.8h, v1.h[1]
            umlal       v14.4s, v30.4h, v1.h[1]
            umlal2      v15.4s, v30.8h, v1.h[1]
    108:    umlal2      v14.4s, v21.8h, v1.h[0]
            umlal       v15.4s, v22.4h, v1.h[0]
            umlal2      v14.4s, v29.8h, v1.h[0]
            umlal       v15.4s, v30.4h, v1.h[0]
    107:    umlal       v14.4s, v22.4h, v0.h[7]
            umlal2      v15.4s, v22.8h, v0.h[7]
            umlal       v14.4s, v29.4h, v0.h[7]
            umlal2      v15.4s, v29.8h, v0.h[7]
    106:    umlal2      v14.4s, v22.8h, v0.h[6]
            umlal       v15.4s, v23.4h, v0.h[6]
            umlal2      v14.4s, v28.8h, v0.h[6]
            umlal       v15.4s, v29.4h, v0.h[6]
    105:    umlal       v14.4s, v23.4h, v0.h[5]
            umlal2      v15.4s, v23.8h, v0.h[5]
            umlal       v14.4s, v28.4h, v0.h[5]
            umlal2      v15.4s, v28.8h, v0.h[5]
    104:    umlal2      v14.4s, v23.8h, v0.h[4]
            umlal       v15.4s, v24.4h, v0.h[4]
            umlal2      v14.4s, v27.8h, v0.h[4]
            umlal       v15.4s, v28.4h, v0.h[4]
    103:    umlal       v14.4s, v24.4h, v0.h[3]
            umlal2      v15.4s, v24.8h, v0.h[3]
            umlal       v14.4s, v27.4h, v0.h[3]
            umlal2      v15.4s, v27.8h, v0.h[3]
    102:    umlal2      v14.4s, v24.8h, v0.h[2]
            umlal       v15.4s, v25.4h, v0.h[2]
            umlal2      v14.4s, v26.8h, v0.h[2]
            umlal       v15.4s, v27.4h, v0.h[2]
    101:    umlal       v14.4s, v25.4h, v0.h[1]
            umlal2      v15.4s, v25.8h, v0.h[1]
            umlal       v14.4s, v26.4h, v0.h[1]
            umlal2      v15.4s, v26.8h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            st1         {v17.16b}, [x9], #16
            bic         x9, x9, #0x40
            mov         v17.16b, v18.16b
            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
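
/* The add/bic pairs in hconv4_25 treat the spill buffer at [x9] as a 64-byte
 * circular buffer: an offset is added to the pointer and bit 6 is then
 * cleared, wrapping the address back to the start of the buffer without a
 * compare.  Roughly (a sketch; it relies, as the bic trick does, on the
 * buffer's base address having bit 6 clear):
 *
 *   uint16_t *wrap(void *base, uintptr_t byte_off) {
 *       uintptr_t p = (uintptr_t)base + byte_off;     // may step past the end
 *       return (uint16_t *)(p & ~(uintptr_t)0x40);    // bic x12, x12, #0x40
 *   }
 */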

/* Dedicated function wrapper for the fetch macro, for the cases where
 * performance isn't that important, to keep code size down.
 */
PRIVATE(fetch_generic_asm)
            stp         x10, x11, [sp, #-16]!
            fetch
            ldp         x10, x11, [sp], #16
            ret
END(fetch_generic_asm)


/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
 * beyond that limit, and filling the rest of the vector with the first legal
 * pixel.
 * Result is in v10 and v11.  v8 and v9 are filled with the first legal pixel.
 * Note: This function can read beyond the right edge of input if the image is
 * narrower than 16 bytes.
 */
PRIVATE(fetch_clampleft1)
            stp         x29, x30, [sp, #-16]!
            bl          fetch_generic_asm
            dup         v8.8h, v10.h[0]
            dup         v9.8h, v10.h[0]
            ands        x12, x10, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            sub         x12, sp, x12, LSL #1
            sub         sp, sp, #64
            sub         x12, x12, #32
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft1)

PRIVATE(fetch_clampleft4)
            stp         x29, x30, [sp, #-16]!
            bl          fetch_generic_asm
            dup         v8.2d, v10.d[0]
            dup         v9.2d, v10.d[0]
            ands        x12, x10, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            sub         x12, sp, x12, LSL #1
            sub         sp, sp, #64
            sub         x12, x12, #32
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft4)

/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
 * reading memory beyond that limit, and filling the rest of the vector with
 * the last legal pixel.
 * Result is in v10 and v11.  v12 and v13 are filled with the last legal pixel.
 * Note: This function can read beyond the left edge of input if the image is
 * narrower than 16 bytes.
 */
PRIVATE(fetch_clampright1)
            stp         x29, x30, [sp, #-16]!
            sub         x12, xzr, x11
            ands        x12, x12, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            sub         x12, xzr, x11
            and         x12, x12, #15
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
1:          bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright1)

PRIVATE(fetch_clampright4)
            stp         x29, x30, [sp, #-16]!
            sub         x12, xzr, x11
            ands        x12, x12, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            sub         x12, xzr, x11
            and         x12, x12, #15
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
1:          bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright4)
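
/* All four clamp fetchers use the same trick: the replicated edge pixel and
 * the fetched data are stored side by side in a stack scratch area, then 16
 * lanes are reloaded at an offset so that out-of-range lanes pick up the edge
 * value.  A scalar sketch of the left-edge case in halfword lanes, where
 * `shortfall` is the number of lanes falling outside the image:
 *
 *   void clamp_left(uint16_t out[16], const uint16_t in[16], int shortfall) {
 *       uint16_t scratch[32];
 *       for (int i = 0; i < 16; i++) scratch[i]      = in[0];  // like v8/v9
 *       for (int i = 0; i < 16; i++) scratch[16 + i] = in[i];  // like v10/v11
 *       for (int i = 0; i < 16; i++) out[i] = scratch[16 - shortfall + i];
 *   }
 */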

/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
 * value across to fill the rest of the register pair.  Used for filling the
 * right hand edge of the window when reading too close to the right hand edge
 * of the image.
 * Also returns a dup-ed copy of the last element in v12 and v13 for the
 * tail-fill case (this happens incidentally in the common path, but must be
 * done deliberately in the fast-out path).
 */
PRIVATE(prefill_sweepright1)
            ands        x12, x11, #15
            beq         1f
            sub         x12, x12, #1
            sub         sp, sp, #64
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1
            ld1r        {v12.8h}, [x12]
            ld1r        {v13.8h}, [x12]
            st1         {v12.8h,v13.8h}, [x12]
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
1:          dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ret
END(prefill_sweepright1)

PRIVATE(prefill_sweepright4)
            ands        x12, x11, #15
            beq         1f
            sub         x12, x12, #4
            sub         sp, sp, #64
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1
1306            ld1r        {v12.2d}, [x12]
1307            st1         {v13.8h}, [x12]
1308            ld1         {v10.8h,v11.8h}, [sp]
1309            add         sp, sp, #64
1310            ret
13111:          dup         v12.2d, v11.d[1]
1312            dup         v13.2d, v11.d[1]
1313            ret
1314END(prefill_sweepright4)
1315
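/* In C terms, prefill_sweepright1 behaves like the sketch below (the step=4
 * variant replicates a whole 4-element pixel the same way); illustration
 * only, with `stop` standing in for x11:
 *
 *  #include <stddef.h>
 *  #include <stdint.h>
 *
 *  // Sweep the last valid element across the rest of the pair, and return
 *  // the padding value that the assembly leaves in v12/v13.
 *  static uint16_t sweepright1_model(uint16_t win[16], size_t stop)
 *  {
 *      size_t n = stop & 15;
 *      if (n == 0)
 *          return win[15];         // fast-out: the pair is entirely valid
 *      uint16_t pad = win[n - 1];  // last valid element
 *      for (size_t i = n; i < 16; i++)
 *          win[i] = pad;           // sweep it across the remainder
 *      return pad;
 *  }
 */
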
/* The main loop keeps a sliding window of data that has already been convolved
 * in the vertical axis for the current line.  This usually stays in the
 * register file, but spills to memory for large windows.  The first thing that
 * needs to be done at start-up is to fill this window with image data, taking
 * into account the padding needed if the left or right edges of the image fall
 * within this window.
 */

/* Because the window is in the register file, writes to it cannot be indexed
 * by another register.  Consequently the fill loops are unrolled to address
 * the registers directly.  This macro distinguishes between writes to the
 * register file and writes to the spill buffer (indicated by a destination
 * register named xx).
 */
.macro prefill_out ra, rb, sra, srb
  .ifc \ra,xx
    .ifc \rb,xx
            st1         {\sra,\srb}, [x9], #32
    .else
            bic         x9, x9, #0x40
            st1         {\sra}, [x9], #16
            mov         \rb, \srb
    .endif
  .else
    .ifnc \ra,\sra
            mov         \ra, \sra
    .endif
    .ifnc \rb,\srb
            mov         \rb, \srb
    .endif
  .endif
.endm

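/* A C sketch of that dispatch, for illustration only: a chunk destined for
 * the window either lands in named registers (the unrolled moves) or in the
 * 64-byte stack ring at x9, whose base has the low seven address bits clear
 * so that wrap-around is a single bit-clear:
 *
 *  #include <stdint.h>
 *  #include <string.h>
 *
 *  static void prefill_out_model(uint16_t *reg_dst,    // NULL means "xx"
 *                                uint16_t **ring,      // x9
 *                                const uint16_t src[16])
 *  {
 *      if (reg_dst) {
 *          memcpy(reg_dst, src, 16 * sizeof *src);     // mov \ra / \rb
 *      } else {
 *          memcpy(*ring, src, 16 * sizeof *src);       // st1 {...}, [x9]
 *          uintptr_t p = (uintptr_t)(*ring + 16);
 *          *ring = (uint16_t *)(p & ~(uintptr_t)0x40); // bic x9, x9, #0x40
 *      }
 *  }
 */
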
/* This macro provides the list of registers representing the window, and the
 * cases where the register file is too small and a spill buffer is used
 * instead.
 * Since several specialisations of each function are generated, this also
 * culls superfluous iterations, and sets the variable `i` for subsequent
 * macros indicating the current index into the window.
 */
.macro prefill_list, macro, nextmacro, max_r, step, label
  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
    .if windowsize >= (\line * 16)
      .set i, windowsize - (\line * 16)
\label\macro\line:
            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
    .endif
  .endm
            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label
            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label
\label\macro\()0:
            b           \label\()_end
  .purgem ifneeded
.endm

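/* The list above is driven by `windowsize`; in C terms the amount of state
 * to initialise is (illustrative only):
 *
 *  // Number of 16-element chunks in the sliding window for a given radius
 *  // and per-pixel element count.  Lines 11-13 above name the destination
 *  // `xx`, so any window needing more than ten full chunks spills to the
 *  // stack ring buffer.
 *  static int window_chunks(int max_r, int step)
 *  {
 *      int windowsize = (2 * max_r * step + 15) & ~15;   // in elements
 *      return windowsize / 16;
 *  }
 *
 * For example, convolve1 at r=25 needs window_chunks(25, 1) == 4 chunks, all
 * in registers, while convolve4 at r=25 needs 13 and spills the topmost ones.
 */
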
/* These macros represent the possible stages of filling the window.
 * Each macro is unrolled enough times that it can fill the entire window
 * itself, but normally it will have to hand control to subsequent macros
 * part-way through and this is done using labels named \next and \after, where
 * \next is the next macro starting at the same window position and \after is
 * the next macro starting after the current window position.
 */

/* leftfill: v8 and v9 contain the left padding value.  While the window
 * extends outside of the image on the left-hand side, and at least 16 more
 * padding values are needed in the window, store v8 and v9 into the window.
 * Otherwise skip forward to storing image data.
 */
.macro prefill_leftfill, next, after, ra, rb, step
            cmp         x10, #i+16
            blo         \next
            prefill_out \ra, \rb, v8.16b, v9.16b
.endm

/* leftedge: The very first non-fill or partial-fill chunk from the image is
 * already loaded (as it was used to calculate the left padding value), so
 * store it here, and then drop into the regular load/store cycle in the next
 * macro.
 */
.macro prefill_leftedge, next, after, ra, rb, step
1:          prefill_out \ra, \rb, v10.16b, v11.16b
            b           \after
.endm

/* dofetch: Copy chunks of the image into the window without any complications
 * from edge conditions.
 */
.macro prefill_dofetch, next, after, ra, rb, step
            cmp         x11, #i+16
            bls         \next
            bl          fetch_generic_asm
            prefill_out \ra, \rb, v10.16b, v11.16b
.endm

/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
 * the right-hand edge of the image.  In that case sweep the last valid pixel
 * across the rest of the chunk, and in either case prepare padding data in v12
 * and v13 for the next macro.  This is done in fetch_clampright.
 * This only happens once before going on to the next macro.
 * Sometimes leftedge also covers the rightedge case, in which case this has
 * to be skipped altogether.
 */
.macro prefill_rightedge, next, after, ra, rb, step
            cmp         x11, #i
            bls         \next
            bl          fetch_clampright\step
            prefill_out \ra, \rb, v10.16b, v11.16b
            b           \after
.endm

/* rightfill: The rest of the window is simply filled with right padding from
 * v12 and v13.
 */
.macro prefill_rightfill, next, after, ra, rb, step
            prefill_out \ra, \rb, v12.16b, v13.16b
.endm

/* Here all of the macros above are unrolled and laid out in the proper order.
 */
.macro prefill_body, max_r, step, label
            prefill_list leftfill,  leftedge,   \max_r, \step, \label
            prefill_list leftedge,  dofetch,    \max_r, \step, \label
            prefill_list dofetch,   rightedge,  \max_r, \step, \label
            prefill_list rightedge, rightfill,  \max_r, \step, \label
            prefill_list rightfill, oops,       \max_r, \step, \label
\label\()_end:
.endm

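/* Taken together, the staged fill behaves like this C sketch (helper names
 * are hypothetical stand-ins for the macro bodies above; `start` and `stop`
 * are x10 and x11):
 *
 *  #include <stdint.h>
 *
 *  void fill16(uint16_t *dst, const uint16_t *pad);  // store a padding chunk
 *  void fetch16(uint16_t *dst);                      // fetch_generic_asm
 *  void fetch_clampright16(uint16_t *dst);           // fetch_clampright\step
 *  void store_first_chunk(uint16_t *dst);            // data from clampleft
 *
 *  static void prefill_body_model(uint16_t win[][16], int chunks,
 *                                 int start, int stop,
 *                                 const uint16_t *lpad, const uint16_t *rpad)
 *  {
 *      int i = 0;
 *      while ((i + 1) * 16 <= start)
 *          fill16(win[i++], lpad);          // leftfill
 *      store_first_chunk(win[i++]);         // leftedge
 *      while ((i + 1) * 16 < stop)
 *          fetch16(win[i++]);               // dofetch
 *      if (i * 16 < stop)
 *          fetch_clampright16(win[i++]);    // rightedge
 *      while (i < chunks)
 *          fill16(win[i++], rpad);          // rightfill
 *  }
 */
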
/* Fill the convolution window with context data.  The aim here is to load
 * exactly 2*r columns, and in the main loop to read as many columns as will be
 * written.  This is complicated by the window being divided into chunks at
 * register boundaries, by cases where the input starts very close to the left
 * or right (or both) edges of the image, and by the need to fill the spaces
 * that leaves with left and right edge padding values.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x3 -- count
 *      x4 -- available image data right of src pointer
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x8 -- available image data left of src pointer
 *      x9 -- buffer (if needed)
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x4 -= min(inlen, count + windowsize - centertap)
 *      x1 += min(inlen, count + windowsize - centertap)
 *      x15 += min(inlen, count + windowsize - centertap)
 *      x19 += min(inlen, count + windowsize - centertap)
 * Modifies:
 *      x10 -- fill start index in the window
 *      x11 -- fill stop index in the window
 *      x12 -- scratch
 */
.macro prefill step=1, max_r=25, label=xx
.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
.set centertap, (windowsize - \max_r * \step)
            mov         x10, #centertap
            subs        x10, x10, x8
            csel        x10, xzr, x10, lo

            subs        x11, x4, #windowsize - centertap
            csel        x11, xzr, x11, hs
            add         x11, x11, #windowsize

            /* x10 indicates where in the window legal image data begins.
             * x11 indicates where in the window legal image data ends.
             * When starting near the centre of a large image these would be
             * zero and windowsize respectively, but when starting near the
             * edges this can change.
             * When starting on the leftmost pixel, x10 will be centertap.
             * When starting on the rightmost pixel, x11 will be centertap+1.
             */

            /* x4 indicates how much data there is between the current pointers
             * and the right edge of the image.  The pointers currently point
             * to the data needed at centertap.  The subsequent code will
             * consume (windowsize - x10) data, but only the data from
             * centertap to windowsize comes out of x4's budget.
             */
1:          subs        x4, x4, #windowsize - centertap
            csel        x4, xzr, x4, lo

            /* And the pointers need to rewind to the start of the window.
             */
            sub         x1, x1, #centertap
            sub         x15, x15, #centertap
            sub         x19, x19, #centertap

            /* Unless x8 indicated that there wasn't that much data available.
             */
            add         x1, x1, x10
            add         x15, x15, x10
            add         x19, x19, x10

            /* Get the first chunk, and add padding to align it to the window
             * if necessary.
             */
            bl          fetch_clampleft\step

            /* Sometimes the start and the end of the window are in the same
             * chunk.  In that case both ends need filler at the outset.
             */
            sub         x12, x11, #1
            eor         x12, x10, x12
            cmp         x12, #16
            bhs         1f
            bl          prefill_sweepright\step

            /* Iterate through all the points in the window and fill them in
             * with padding or image data as needed.
             */
1:          prefill_body \max_r, \step, \label
.endm

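/* The index arithmetic at the top of the macro, as compilable C (sketch
 * only; left_avail and right_avail are x8 and x4 on entry):
 *
 *  #include <stddef.h>
 *
 *  static void prefill_indices(size_t r, size_t step,
 *                              size_t left_avail, size_t right_avail,
 *                              size_t *start, size_t *stop)
 *  {
 *      size_t windowsize = (2 * r * step + 15) & ~(size_t)15;
 *      size_t centertap  = windowsize - r * step;
 *      // x10: where legal image data begins in the window
 *      *start = centertap > left_avail ? centertap - left_avail : 0;
 *      // x11: where legal image data ends in the window
 *      *stop = right_avail >= windowsize - centertap
 *                  ? windowsize
 *                  : right_avail + centertap;
 *  }
 */
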
/* The main body of the convolve functions.  Having already pre-filled the
 * convolution window with 2*r input values, the logic settles into a regular
 * pattern of reading and writing at a 1:1 rate until either input or output
 * expires.  The input leads the output by r values, so when processing all the
 * way to the right-hand edge, or within r pixels of that edge, the input will
 * run out first.  In the case of very narrow images, or sub-windows starting
 * near the right edge, the input may already have run out while the
 * convolution window was being filled, and this loop will start with a
 * zero-length input.
 *
 * Once the input runs out, the rest of the output must be processed by padding
 * the remainder of the window with copies of the last valid pixel from the
 * source.
 *
 * Input:
 *      x0 = dst
 *      x1 = src
 *      x2 = pitch
 *      x3 = count
 *      x4 = inlen
 *      x5 = r
 *      x6 = rup
 *      x7 = rdn
 *      x9 = buffer
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Modifies:
 *      x8 = fetch code pointer
 */
.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""

            /* If x4 >= x3 then there's no need for clipping.  The main loop
             * needs to exit when either x3 or x4 runs out, so clamp x4 to be
             * no greater than x3 and use x4 for the loop.
             * However, if x4 comes out of the loop with less than 16 bytes
             * left, a partial read would be necessary to avoid reading beyond
             * the end of the image.  To avoid this, clamp x4 to the next
             * multiple of 16, which is still sufficient to force it out of the
             * loop but doesn't imply a rewind.
             */
            add         x12, x3, #15
            bic         x12, x12, #15
            cmp         x4, x12
            csel        x4, x12, x4, hi

            /* First calculate the entry-point into the internal fetch logic.
             * This is done so the same function can service several kernel
             * sizes.
             */
            adrp        x8, \labelnc
            add         x8, x8, #:lo12:\labelnc
            sub         x8, x8, x5, LSL #5
            sub         x8, x8, x5, LSL #3
            cmp         x5, x6
            ccmp        x5, x7, #0, eq
            beq         5f

            /* if (r != rup || r != rdn) then the address-clamping table should
             * be used rather than the short-cut version.
             */
            adrp        x8, \labelc
            add         x8, x8, #:lo12:\labelc
            sub         x8, x8, x5, LSL #6
            add         x8, x8, x5, LSL #3
            b           5f

            /* Main loop: ... */
            .align  4
3:          /* first perform a vertical convolution from memory to get the next
             * 16 taps of the horizontal window into the register file...
             */
            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8

            /* ...then perform a horizontal convolution on that window to
             * produce eight output bytes, and slide the window along.
             * This has to be done twice to match the 16-way vertical pass.
             * It would be preferable to have twice the work done in \core, but
             * that would demand yet another variant on those macros and would
             * perturb the register allocation severely.
             */
            \core
            st1         {v15.8b}, [x0], #8
            \core
            st1         {v15.8b}, [x0], #8

            sub         x3, x3, #16
5:          subs        x4, x4, #16
            bhi         3b
            /* Here there's 16 or fewer bytes available before the edge of the
             * source image.  x4 holds that count minus 16 (because it was
             * decremented before the first iteration ran).  The last read may
             * not be a whole chunk, and beyond that a fill value must be used.
             *
             * Of course, none of that matters if there's no more output to
             * produce...
             */
            cbz         x3, 5f

            /* Oh well. */
            adds        x4, x4, #16
            bne         1f
  .if \step==1
            dup         v10.8h, v9.h[7]
            dup         v11.8h, v9.h[7]
  .else
            dup         v10.2d, v9.d[1]
            dup         v11.2d, v9.d[1]
  .endif
            b           3f

            /* To avoid reading past the end of the input, rewind the pointers
             * by (16 - x4) to ensure that they're exactly 16 bytes from the
             * edge.
             */
1:          mov         x11, x4
            bl          fetch_clampright\step
            /* Now to put this padding to use, perform any remaining
             * iterations.  This is done at half the rate of the main loop,
             * because there's no longer pressure from a 16-lane window filler.
             */
3:          \core
  .if \step==1
            dup         v11.8h, v11.h[7]
  .else
            dup         v11.2d, v11.d[1]
  .endif
            subs        x3, x3, #8
            blo         4f
            st1         {v15.8b}, [x0], #8
            bne         3b
            b           5f

            /* If the final iteration contained 0 < l < 8 values, then perform
             * a piecewise store of the final vector.
             */
4:          tbz         x3, #2, 1f
            st1         {v15.s}[0], [x0], #4
            ext         v15.8b, v15.8b, v15.8b, #4
1:          tbz         x3, #1, 1f
            st1         {v15.h}[0], [x0], #2
            ext         v15.8b, v15.8b, v15.8b, #2
1:          tbz         x3, #0, 5f
            st1         {v15.b}[0], [x0], #1
            ext         v15.8b, v15.8b, v15.8b, #1
5:          mov         x0, #0
.endm

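/* The overall shape of conv_body, as an illustrative C sketch (helper names
 * are hypothetical; `count` and `inlen` are x3 and x4; the exact loop-exit
 * flag tests above are simplified):
 *
 *  #include <stddef.h>
 *  #include <stdint.h>
 *  #include <string.h>
 *
 *  void fetch_convolve16(void);         // vertical pass into the window
 *  void core8(uint8_t out[8]);          // one horizontal pass, 8 outputs
 *  void clamped_fetch(size_t avail);    // fetch_clampright / window re-pad
 *  void repad_window(void);             // dup the last valid pixel
 *
 *  static void conv_body_model(uint8_t *dst, size_t count, size_t inlen)
 *  {
 *      size_t lim = (count + 15) & ~(size_t)15;
 *      if (inlen > lim)
 *          inlen = lim;                 // never fetch more than is emitted
 *      while (inlen > 16) {             // 1:1 main loop, 16 per pass
 *          fetch_convolve16();
 *          core8(dst);  dst += 8;
 *          core8(dst);  dst += 8;
 *          inlen -= 16;  count -= 16;
 *      }
 *      if (count == 0)
 *          return;
 *      clamped_fetch(inlen);            // last partial chunk, plus padding
 *      do {                             // drain at 8 outputs per pass
 *          uint8_t out[8];
 *          core8(out);
 *          repad_window();
 *          size_t n = count < 8 ? count : 8;
 *          memcpy(dst, out, n);         // piecewise for the final 1..7
 *          dst += n;  count -= n;
 *      } while (count);
 *  }
 */
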
.irp r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
            stp         x29,x30, [sp, #-16]!

            prefill     step=1, max_r=\r, label=.Lcnv1_\r

            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r

            ldp         x29,x30, [sp], #16
            ret
END(convolve1_\r)
.endr

.irp r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
            sub         x9, sp, #0x40
            stp         x29,x30, [sp, #-(16 + 0x40 + 0x80)]!
            bic         x9, x9, #0x7f

            /* x9 now points to a 0x40 byte buffer on the stack whose address
             * has the low 7 bits clear.  This allows easy address calculation
             * in the wrap-around cases.
             */

            prefill     step=4, max_r=\r, label=.Lcnv4_\r

            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r

            ldp         x29,x30, [sp], #(16 + 0x40 + 0x80)
            ret
END(convolve4_\r)
.endr

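/* The spill-buffer address computed in the convolve4 prologue, in C terms
 * (sketch; `sp0` stands for the stack pointer on entry):
 *
 *  #include <stdint.h>
 *
 *  static uint16_t *spill_base(uintptr_t sp0)
 *  {
 *      // sub x9, sp, #0x40 ; bic x9, x9, #0x7f: a 0x40-byte buffer whose
 *      // base has the low 7 bits clear, inside the frame reserved by stp.
 *      return (uint16_t *)((sp0 - 0x40) & ~(uintptr_t)0x7f);
 *  }
 *
 * With those bits clear, prefill_out's wrap-around is the single
 * `bic x9, x9, #0x40` seen earlier.
 */
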
/* void rsdIntrinsicBlurU1_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 */
ENTRY(rsdIntrinsicBlurU1_K)
            stp         x19,x30, [sp, #-16]!
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]
            mov         x8, x5          // x
            ldr         w5, [sp,#80]    // r
            sub         x9, x2, x8      // w - x
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            mov         x3, x7          // count
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x)

            ldr         x12, [sp, #88]  // tab

            add         x1, x1, x8      // src += x

            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)

            sub         x13, xzr, x2    // -pitch
            msub        x15, x2, x6, x1
            madd        x19, x2, x7, x1

            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            adr         x30, 1f
  .irp r, TUNED_LIST1
            cmp         x5, #\r
            bls         convolve1_\r
  .endr
            b           convolve1_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU1_K)

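/* What the prologue above computes before dispatching, as compilable C
 * (sketch only; the struct and names are descriptive, not from the source):
 *
 *  #include <stddef.h>
 *  #include <stdint.h>
 *
 *  typedef struct {
 *      const uint8_t *src, *top, *bot;   // x1, x15, x19
 *      size_t pitch, count, inlen;       // x2, x3, x4
 *      size_t r, rup, rdn;               // x5, x6, x7
 *  } blur_args;
 *
 *  static void setup_u1(blur_args *a, const uint8_t *in, size_t w, size_t h,
 *                       size_t p, size_t x, size_t y, size_t count, size_t r)
 *  {
 *      a->r     = r;
 *      a->rup   = y < r ? y : r;                  // clamp window to the top
 *      a->rdn   = h - y - 1 < r ? h - y - 1 : r;  // ... and to the bottom
 *      a->pitch = p;
 *      a->count = count;
 *      a->inlen = w - x;                  // image data right of src pointer
 *      a->src   = in + x;
 *      a->top   = a->src - a->rup * p;    // msub x15, x2, x6, x1
 *      a->bot   = a->src + a->rdn * p;    // madd x19, x2, x7, x1
 *  }
 */
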
/* void rsdIntrinsicBlurU4_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 */
ENTRY(rsdIntrinsicBlurU4_K)
            stp         x19,x30, [sp, #-16]!
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]
            lsl         x8, x5, #2      // x
            lsl         x2, x2, #2
            ldr         w5, [sp,#80]    // r
            sub         x9, x2, x8      // w - x
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            lsl         x3, x7, #2      // count
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x)

            ldr         x12, [sp, #88]  // tab

            add         x1, x1, x8      // in += x

            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)

            sub         x13, xzr, x2
            msub        x15, x2, x6, x1
            madd        x19, x2, x7, x1

            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            adr         x30, 1f
  .irp r, TUNED_LIST4
            cmp         x5, #\r
            bls         convolve4_\r
  .endr
            b           convolve4_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU4_K)

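/* Seen from C, both kernels share the signature documented in the comments
 * above; a caller would declare them as follows (the table is the 32
 * uint16_t coefficient entries that the prologues load into v0-v3):
 *
 *  #include <stddef.h>
 *  #include <stdint.h>
 *
 *  extern void rsdIntrinsicBlurU1_K(void *out, void const *in,
 *                                   size_t w, size_t h, size_t p,
 *                                   size_t x, size_t y, size_t count,
 *                                   size_t r, uint16_t const *tab);
 *  extern void rsdIntrinsicBlurU4_K(void *out, void const *in,
 *                                   size_t w, size_t h, size_t p,
 *                                   size_t x, size_t y, size_t count,
 *                                   size_t r, uint16_t const *tab);
 */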